This is an R Markdown Notebook for analysis using data on the DC Bus System (WMATA Metrobus). The data were obtained here:

https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/

Control + Alt + Shift + m = rename in scope

Load the packages to be used.

package ‘maptools’ was built under R version 3.2.5

Get the Bus data.

First let’s check the working directory.

getwd()
[1] "/Users/mdturse/Desktop/Analytics/DCMetroBus"

Then, actually get the data.

The working directory was changed to /Users/mdturse/Desktop/Analytics/DCMetroBus/Bus AVL Oct 2016 inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the the working directory for notebook chunks.cannot open file '20161003MetrobusAVL.txt': No such file or directoryError in file(file, "rt") : cannot open the connection

Put the daily data together.


AllDays <- bind_rows(list(Oct03Raw, Oct04Raw, Oct05Raw, Oct06Raw, Oct07Raw),
                     .id = c("group")
                    )
# dim(AllDays)
str(AllDays)

Deleting old data frames.


for (i in 3:7){
  rm(list = ls(pattern = paste0("Oct0", i, "Raw")
              )
    )
  
  message("Deleting Oct0", i, "Raw")
  }

Updating variable types.

Then, sorting the data and adding a RowNumber (to be used for identifying rows later in the analyses.)


rm(i)


AllDays$group <- factor(AllDays$group)
AllDays$Route_Direction <- factor(AllDays$Route_Direction)
AllDays$Event_Time <- as.POSIXct(AllDays$Event_Time, format = "%m-%d-%y %I:%M:%S %p")
AllDays$Departure_Time <- as.POSIXct(AllDays$Departure_Time, format = "%m-%d-%y %I:%M:%S %p")

str(AllDays)


AllDays_Sorted <- arrange(AllDays,
                          Bus_ID,
                          Event_Time
                         ) %>% 
  mutate(RowNum_OG = row_number() # this is useful in identify the row later on
        )

rm(AllDays)
str(AllDays_Sorted)

# View(head(AllDays_Sorted, 100))

Inspecting the values of Stop_ID, and finding that it can take the values “” (blank) and “NULL”.


View(group_by(AllDays_Sorted,
              Stop_ID
             ) %>% 
       summarise(
         Cnt = n()
         ) %>% 
       arrange(Stop_ID)
    )

View(filter(AllDays_Sorted,
            is.na(Stop_ID) |
              Stop_ID == "" |
              Stop_ID == "NULL"
           ) %>% 
       arrange(Stop_Desc)
    )

Creating a table of distinct Stop_Desc values when Stop_ID is “” (blank) or “NULL”.


StopID_New <- filter(AllDays_Sorted,
                     is.na(Stop_ID) |
                       Stop_ID == "" |
                       Stop_ID == "NULL"
                    ) %>% 
  select(Stop_ID, Stop_Desc) %>% 
  distinct() %>% 
  arrange(Stop_ID, Stop_Desc) %>% 
  mutate(StopID_New = 1:nrow(.)
        )

View(StopID_New)

Creating a full updated table by filling in StopID_New for when Stop_ID is “” (blank) or NULL.


AllDays_StopIDNew <- left_join(AllDays_Sorted,
                               select(StopID_New,
                                      Stop_Desc,
                                      StopID_New
                                     ),
                               by = c("Stop_Desc" = "Stop_Desc")
                              ) %>% 
  mutate(StopID_Clean = ifelse(is.na(StopID_New),
                               Stop_ID,
                               StopID_New
                              ),
         StopID_Indicator = factor(ifelse(is.na(StopID_New),
                                          "ID_OK",
                                          "ID_Bad"
                                         )
                                  )
        )

rm(StopID_New)
rm(AllDays_Sorted)
str(AllDays_StopIDNew)

# View(tail(AllDays_StopIDNew, 500))
# View(filter(AllDays_StopIDNew,
#             Stop_Desc == "METROWAY ANNNOUCEMNT CORR"
#            )
#     )

Lat Long stats for pulling in Zip codes later.


LL_Stats <- group_by(AllDays_StopIDNew,
                     StopID_Clean
                    ) %>% 
  summarise(Lat_Mean = mean(Latitude, na.rm = TRUE),
            Lat_Med = median(Latitude, na.rm = TRUE),
            Lng_Mean = mean(Longitude, na.rm = TRUE),
            Lng_Med = median(Longitude, na.rm = TRUE)
           ) %>% 
  mutate(Lat_MeaLessMed = Lat_Mean - Lat_Med,
         Lng_MeaLessMed = Lng_Mean - Lng_Med,
         RowNum = row_number()
        )

str(LL_Stats)
summary(LL_Stats)

View(head(arrange(LL_Stats,
                  Lat_MeaLessMed
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  desc(Lat_MeaLessMed)
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  Lng_MeaLessMed
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  desc(Lng_MeaLessMed)
                 ),
          500
         )
    )

Pulling in Zip Code data from api.geonames.org.


# URL EXAMPLE:
# http://api.geonames.org/findNearbyPostalCodesJSON?lat=38.89560&lng=-76.94873&radius=0&username=supermdat

url_1 <- "http://api.geonames.org/findNearbyPostalCodesJSON?lat="
url_2 <- "&lng="
url_3 <- "&radius=0&username="
username <- "supermdat"


# need to group in bunches as http://api.geonames.org limits pulls to 2000 per hour


##### Store everything in multiple lists
pages1 <- list()


system.time(
#   for(j in 0:5){
#   for(k in ((max_row_per_hr*j) + 1):(max_row_per_hr*(j+1)
#                                     )
#      ){
#     
#   }
# }
for(i in 1:1000){
  lat <- filter(LL_Stats,
                RowNum == i
               ) %>%
    select(Lat_Med)
  
  lng <- filter(LL_Stats,
                RowNum == i
               ) %>%
    select(Lng_Med)
  
  APIData1 <- fromJSON(paste0(url_1,
                              lat,
                              url_2,
                              lng,
                              url_3,
                              username
                             ),
                       flatten = TRUE
                      )
  
  message("Retrieving Zip Code ", i)
  
  pages1[[i]] <- APIData1$postalCodes
  
  # Sys.sleep(3900)
}
)

# class(APIData1)
# class(APIData2$postalCodes)
# str(APIData1)
# head(APIData1)
# class(pages1)
# str(pages1)
# nrow(page1)
# ncol(page1)
# pages1[[1199]]
# pages1[[2051]]


##### Combine the lists into one page
Zips1 <- rbind.pages(pages1[sapply(pages1, length) > 0])


##### Combine all pages
Zips_All <- bind_rows(Zips0,
                      Zips1,
                      Zips2,
                      Zips3,
                      Zips4,
                      Zips5,
                      Zips6,
                      Zips7,
                      Zips8,
                      Zips9,
                      Zips10,
                      # Zips1_a,
                      .id = "id"
                     ) %>% 
  mutate(UniqueLatLng = paste(lat, lng, sep = "__")
        )

# str(Zips_All)
# View(head(Zips_All))


# str(LL_Stats)
LL_Stats_UnqLatLng <- mutate(LL_Stats,
                             UniqueLatLng = paste(Lat_Med, Lng_Med, sep = "__")
                            )

# str(LL_Stats_UnqLatLng)
# View(head(LL_Stats_UnqLatLng))


LL_StatsZips <- left_join(LL_Stats_UnqLatLng,
                          Zips_All,
                          by = c("UniqueLatLng" = "UniqueLatLng")
                         )

str(LL_StatsZips)
# View(head(LL_StatsZips))

# Not sure whey these couldn't be found (why they're NA)
View(filter(LL_StatsZips,
            is.na(postalCode)
           )
    )

Join to create one dataset that also includes Zip variables.


rm(url_1, url_2, url_3, username, pages0, pages1, pages2, pages3, pages4, pages5, pages6, pages7, pages8, pages9, pages10, i, lat, lng, APIData0, APIData1, APIData2, APIData3, APIData4, APIData5, APIData6, APIData7, APIData8, APIData9, APIData10, LL_Stats, LL_Stats_UnqLatLng)


AllDays_Zips <- left_join(AllDays_StopIDNew,
                          LL_StatsZips,
                          by = c("StopID_Clean" = "StopID_Clean")
                         ) %>% 
  rename(Stop_State = adminCode1,
         Stop_County = adminName2,
         Stop_City = placeName,
         Stop_Zip = postalCode
         )

rm(AllDays_StopIDNew, LL_StatsZips)
str(AllDays_Zips)

Updating variable types.


AllDays_Zips$Stop_State <- factor(AllDays_Zips$Stop_State)
AllDays_Zips$Stop_County <- factor(AllDays_Zips$Stop_County)
AllDays_Zips$Stop_Zip <- factor(AllDays_Zips$Stop_Zip)
AllDays_Zips$Stop_City <- factor(AllDays_Zips$Stop_City)

AllDays_Zips$distance <- as.numeric(AllDays_Zips$distance)
AllDays_Zips$countryCode <- factor(AllDays_Zips$countryCode)
AllDays_Zips$adminName1 <- factor(AllDays_Zips$adminName1)

str(AllDays_Zips)

Feature engineering.

Inspecting incidences of consecutive Stop_IDs. This is done because investigation showed that many conseutive events occurr at the same Stop_ID, but with various Dwell_Times, Odometer_Distances, etc. All of which affect calculations and analyses.

Create data on the runs (consecutive Stop_IDs).


StopID_Runs <- rle(AllDays_Zips$StopID_Clean)

StopID_Runs$ends <- cumsum(StopID_Runs$lengths)

StopID_Runs$starts <- ifelse(is.na(lag(StopID_Runs$ends)
                                  ),
                             1,
                             lag(StopID_Runs$ends) + 1
                            )

str(StopID_Runs)
# class(StopID_Runs)
# 
# StopID_Runs_df <- data.frame(unclass(StopID_Runs))
# str(StopID_Runs_df)
# class(StopID_Runs_df)
# rm(StopID_Runs_df)

Trying to link data on RunsGroups with the original data (AllDays_Sorted). The goal is to select only one record per RunsGroup - that being the record with the longest Dwell_Time.

I attempted this computation using both data.frames (dplyr) and data.tables (data.table). However, with 2,809,062 rows in one dataset and 3,119,443 rows in the other dataset, the current computation time is over 5 days…so I’m trying a different strategy to only select the first record in a run.


# Create a RunsGroup variable for each run
# StopID_Runs_df$RunsGroup <- paste0("g", seq(1:nrow(StopID_Runs_df)
#                                            )
#                                   )
# 
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
# 
# StopID_Runs_df <- StopID_Runs_df %>% 
#   mutate(RowNum = row_number()
#         )
# 
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
# 
# 
# # Converting to data.tables for, hopefully, improved performance (speed) in computation
# StopID_Runs_dt <- data.table(StopID_Runs_df)
# setkey(StopID_Runs_dt, RowNum)
# str(StopID_Runs_dt)
# 
# AllDays_Sorted_dt <- data.table(AllDays_Sorted)
# setkey(AllDays_Sorted_dt, RowNum_OG)
# str(AllDays_Sorted_dt)
# # rm(AllDays_Sorted_dt)
# 
# 
# # Actual loop to perform the computations and link to original data (AllDays_Sorted_dt)
# GroupData <- list()
# for(i in 1:nrow(StopID_Runs_dt)
#    ) {
#   assign(paste0("group_", i),
#            StopID_Runs_dt[RowNum == i, RunsGroup]
#           )
# 
#     #####  The code below is the same code as above, but done with dplyr  #####
# 
#     # assign(paste0("group_", i),
#   #        filter(StopID_Runs_df,
#   #               RowNum == i
#   #              ) %>% 
#   #          select(RunsGroup)
#   #       )
# 
#   assign(paste0("group_", i, "_start"),
#          StopID_Runs_dt[RowNum == i, starts]
#         )
# 
#   assign(paste0("group_", i, "_end"),
#          StopID_Runs_dt[RowNum == i, ends]
#         )
# 
#   assign(paste0("group_", i, "_rows"),
#          AllDays_Sorted_dt[RowNum_OG >= as.numeric(get(paste0("group_", i, "_start")
#                                                       )
#                                                   ) &
#                            RowNum_OG <= as.numeric(get(paste0("group_", i, "_end")
#                                                       )
#                                                   ),
#                            RunsGroup := as.character(get(paste0("group_", i)
#                                                         )
#                                                     )
#                           ]
# 
#     #####  The code below is the same as the code above, but done with dplyr  #####
# 
#          # filter(AllDays_Sorted,
#          #        between(RowNum_OG,
#          #                as.numeric(get(paste0("group_", i, "_start")
#          #                              )
#          #                          ),
#          #                as.numeric(get(paste0("group_", i, "_end")
#          #                              )
#          #                          )
#          #               )
#          #       ) %>% 
#          #   mutate(RunsGroup = as.character(get(paste0("group_", i)
#          #                                     )
#          #                                 )
#          #        )
#         )
# 
#   GroupData[[i]] <- get(paste0("group_", i, "_rows"))
# 
#   message("Processing Group ", i, " of 2,809,062")
# }
# 
# 
# GroupData_df <- rbind.fill(GroupData)
# str(GroupData_df)
# head(GroupData_df)
# tail(GroupData_df)
# # rm(GroupData_df)
# 
# 
# group_1
# group_1_start
# group_1_end
# group_1_rows
# group_2_rows
# group_3_rows
# group_50_rows
# str(group_50_rows)
# group_2809062_rows
# GroupData[[1]]
# GroupData[[50]]
# 
# 
# #####  Testing Area (Below)  #####
# #####  Testing Area (Below)  #####
# #####  Testing Area (Below)  #####
# 
# # head(StopID_Runs$starts, 20)
# # head(AllDays_NewOrder$Stop_ID, 20)
# # 
# # 
# # dat <- as.data.frame(c(1,1,7,7,7,9,6,8,2,2,2,1,1,1,1,1))
# # colnames(dat)[1] <- "dat"
# # r <- rle(dat$dat)
# # dat$run <- rep(r$lengths, r$lengths)
# # dat$runLag <- lag(dat$run)
# # dat$cond <- rep(r$values, r$lengths)
# # dat
# # View(dat)

When consecutive Stop_ID occurrs, only take the first occurrence. This is done because the computation time to select only the record with the longest Dwell_Time for each run was too long (over 5 days).

This is probably less than ideal with regards to Dwell_Time, but should not make much difference for calculations of travel time, speed, etc.


AllDays_FirstStopID <- AllDays_Zips[StopID_Runs$starts, ]

dim(AllDays_Zips)
dim(AllDays_FirstStopID)

nrow(AllDays_Zips) - nrow(AllDays_FirstStopID)

rm(AllDays_Zips, StopID_Runs)
str(AllDays_FirstStopID)

Feature engineering.

Creating new variables.


AllDays_AddVars <- mutate(AllDays_FirstStopID,
                          Odometer_Distance_Mi = Odometer_Distance / 5280, #5,280 feet in 1 mile
                          Dwell_Time2 = as.numeric(Departure_Time - Event_Time),
                          Event_Time_Yr = as.integer(year(Event_Time)),
                          Event_Time_Mth = as.integer(month(Event_Time)),
                          Event_Time_Date = day(Event_Time),
                          Event_Time_Day = wday(Event_Time, label = TRUE),
                          Event_Time_Hr = hour(Event_Time),
                          Event_Time_Min = minute(Event_Time),
                          Event_Time_HrGroup = factor(ifelse(Event_Time_Hr < 3,
                                                             "Group0_2",
                                                      ifelse(Event_Time_Hr < 6,
                                                             "Group3_5",
                                                      ifelse(Event_Time_Hr < 9,
                                                             "Group6_8",
                                                      ifelse(Event_Time_Hr < 12,
                                                             "Group9_11",
                                                      ifelse(Event_Time_Hr < 15,
                                                             "Group12_14",
                                                      ifelse(Event_Time_Hr < 18,
                                                             "Group15_17",
                                                      ifelse(Event_Time_Hr < 21,
                                                             "Group18_20",
                                                      ifelse(Event_Time_Hr < 24,
                                                             "Group21_23"
                                                            )))))))),
                                                         levels = c("Group0_2",
                                                                    "Group3_5",
                                                                    "Group6_8",
                                                                    "Group9_11",
                                                                    "Group12_14",
                                                                    "Group15_17",
                                                                    "Group18_20",
                                                                    "Group21_23"
                                                                   ),
                                                         ordered = TRUE
                                                     )
                         )

rm(AllDays_FirstStopID)
str(AllDays_AddVars)


# group_by(AllDays_AddVars,
#          Event_Time_HrGroup
#         ) %>% 
#   summarise(Cnts = n()
#            )


# View(head(filter(AllDays_AddVars,
#                  Event_Time_Hr == 0
#                 ),
#           50
#          )
#     )

# View(head(AllDays_AddVars, 50))

Feature engineering.

Creating more variables. Creating a BusEvent row number for future identification purposes. Then, creating various variables to analyze distance traveled and speed.


AllDays_BusDay <- group_by(AllDays_AddVars,
                           Bus_ID,
                           Event_Time_Date
                          ) %>% 
  mutate(BusDay_EventNum = row_number(),  # used to identify Bus movements on a particular date
         
         Route_Lag1 = lag(Route),  # used in future analyses to identify Route changes
         RouteAlt_Lag1 = lag(RouteAlt),  # used in future analyses to identify RouteAlt (direction) changes
         
         Odometer_Distance_Lag1 = lag(Odometer_Distance),
         
         Latitude_L1 = lag(Latitude),
         Longitude_L1 = lag(Longitude),
         # Lat_Radian = Latitude*pi/180,
         # Long_Radian = Longitude*pi/180,
         # Lat_Radian_L1 = lag(Lat_Radian),
         # Long_Radian_L1 = lag(Long_Radian),
         
         # accounting for potential negative distances
         TravelDistance_Ft = ifelse(Odometer_Distance > Odometer_Distance_Lag1,
                                    Odometer_Distance - Odometer_Distance_Lag1,
                                    NA
                                   ),
         TravelDistance_Mi = TravelDistance_Ft / 5280, #5,280 feet in 1 mile
         
         # TravelDistance_Mi2 = gcd.hf(long1 = Long_Radian_L1,
         #                             lat1 = Lat_Radian_L1,
         #                             long2 = Long_Radian,
         #                             lat2 = Lat_Radian
         #                            ),
         
         TravelDistance_Mi_Hvrs = 
                              # ifelse((is.na(Longitude_L1) | is.na(Latitude_L1)
                              #        ),
                              #        NA,
                              distHaversine(cbind(Longitude_L1, Latitude_L1),
                                            cbind(Longitude, Latitude)
                                           ) * 0.000621371, # 0.000621371 miles = 1 meter
         
         # accounting for potential negative times
         TravelTime_Sec = as.numeric(ifelse(Event_Time > lag(Departure_Time),
                                            Event_Time - lag(Departure_Time),
                                            NA
                                           )
                                    ),
         TravelTime_Hr = TravelTime_Sec / 3600, # 3,600 seconds in 1 hour
         
         # accounting for potential negative or zero travel times
         SpeedAvg_Mph = ifelse(TravelTime_Hr > 0,
                               TravelDistance_Mi / TravelTime_Hr,
                               NA
                              ),
         
         Start_ID = lag(StopID_Clean),
         Start_Desc = lag(Stop_Desc),
         StartStop_ID = ifelse(is.na(Start_ID),
                               paste("NULL", StopID_Clean, sep = "--"),
                               paste(Start_ID, StopID_Clean, sep = "--")
                              )
        ) %>% 
  as.data.frame()


rm(AllDays_AddVars)
str(AllDays_BusDay)

# summary(AllDays_BusDay)

# View(tail(AllDays_BusDay, 50))

Inspecting for issues with StartStop_ID (where the value is either NA or contains NULL). They ONLY exist when BusDay_EventNum = 1 (which is by design). So everything looks OK.


View(group_by(AllDays_BusDay,
              StartStop_ID
             ) %>% 
       summarise(
         Cnt = n()
       ) %>% 
       arrange(desc(Cnt)
              )
    )

View(filter(AllDays_BusDay,
            (is.na(StartStop_ID) |
              str_detect(StartStop_ID, "NULL")
            ) &
              BusDay_EventNum != 1
           )
    )

Stats (quantiles) overall for TravelDistance_Mi.


Quantiles_dt <- AllDays_BusDay %>% 
  mutate(TD_Mi_q2 = quantile(x = TravelDistance_Mi, probs = 0.02, na.rm = TRUE),
         TD_Mi_q98 = quantile(x = TravelDistance_Mi, probs = 0.98, na.rm = TRUE),
         TT_Sec_q2 = quantile(x = TravelTime_Sec, probs = 0.02, na.rm = TRUE),
         TT_Sec_q98 = quantile(x = TravelTime_Sec, probs = 0.98, na.rm = TRUE),
         TT_Hr_q2 = quantile(x = TravelTime_Hr, probs = 0.02, na.rm = TRUE),
         TT_Hr_q98 = quantile(x = TravelTime_Hr, probs = 0.98, na.rm = TRUE)
        ) %>% 
  data.table()


Stats <- Quantiles_dt %>% 
  mutate(TD_Mi_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_Mean_F = mean(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98],
                             na.rm = TRUE
                            ),
         TD_Mi_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_Med_F = median(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98],
                              na.rm = TRUE
                             ),
         TD_Mi_Cnt = sum(!is.na(TravelDistance_Mi)
                        ),
         TD_Mi_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98]
                                 )
                          ),
            
         TT_Sec_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_Mean_F = mean(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98],
                              na.rm = TRUE
                             ),
         TT_Sec_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_Med_F = median(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98],
                               na.rm = TRUE
                              ),
         TT_Sec_Cnt = sum(!is.na(TravelTime_Sec)
                         ),
         TT_Sec_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98]
                                   )
                           ),

         TT_Hr_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_Mean_F = mean(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98],
                             na.rm = TRUE
                            ),
         TT_Hr_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_Med_F = median(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98],
                              na.rm = TRUE
                             ),
         TT_Hr_Cnt = sum(!is.na(TravelTime_Hr)
                        ),
         TT_Hr_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98]
                                 )
                          )
        ) %>% 
  data.frame()

rm(AllDays_BusDay)
rm(Quantiles_dt)
str(Stats)
# View(head(Stats, 50))

Stats for StartStop_ID.


Quantiles_SS_dt <- group_by(Stats,
                            StartStop_ID
                           ) %>% 
  mutate(TD_Mi_SS_q5 = quantile(x = TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
         TD_Mi_SS_q95 = quantile(x = TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
         TT_Sec_SS_q5 = quantile(x = TravelTime_Sec, probs = 0.05, na.rm = TRUE),
         TT_Sec_SS_q95 = quantile(x = TravelTime_Sec, probs = 0.95, na.rm = TRUE),
         TT_Hr_SS_q5 = quantile(x = TravelTime_Hr, probs = 0.05, na.rm = TRUE),
         TT_Hr_SS_q95 = quantile(x = TravelTime_Hr, probs = 0.95, na.rm = TRUE)
        ) %>% 
  data.table()


Stats_StSt <- group_by(Quantiles_SS_dt,
                       StartStop_ID
                      ) %>% 
  mutate(TD_Mi_SS_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SS_Mean_F = mean(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95],
                                na.rm = TRUE
                               ),
         TD_Mi_SS_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SS_Med_F = median(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95],
                                 na.rm = TRUE
                                ),
         TD_Mi_SS_Cnt = sum(!is.na(TravelDistance_Mi)
                           ),
         TD_Mi_SS_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95]
                                    )
                             ),
            
         TT_Sec_SS_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SS_Mean_F = mean(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95],
                                 na.rm = TRUE
                                ),
         TT_Sec_SS_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SS_Med_F = median(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95],
                                  na.rm = TRUE
                                 ),
         TT_Sec_SS_Cnt = sum(!is.na(TravelTime_Sec)),
         TT_Sec_SS_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95]
                                     )
                              ),

         TT_Hr_SS_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SS_Mean_F = mean(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95],
                                na.rm = TRUE
                               ),
         TT_Hr_SS_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SS_Med_F = median(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95],
                                 na.rm = TRUE
                                ),
         TT_Hr_SS_Cnt = sum(!is.na(TravelTime_Hr)),
         TT_Hr_SS_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95]
                                    )
                             )
        ) %>% 
  data.frame()

rm(Stats)
rm(Quantiles_SS_dt)
str(Stats_StSt)
# View(head(Stats_StSt, 50))

Stats for StartStop_ID with Event_Time_HrGroup.


Quantiles_SSHG_dt <- group_by(Stats_StSt,
                              StartStop_ID,
                              Event_Time_HrGroup
                             ) %>% 
  mutate(TD_Mi_SSHG_q5 = quantile(x = TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
         TD_Mi_SSHG_q95 = quantile(x = TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
         TT_Sec_SSHG_q5 = quantile(x = TravelTime_Sec, probs = 0.05, na.rm = TRUE),
         TT_Sec_SSHG_q95 = quantile(x = TravelTime_Sec, probs = 0.95, na.rm = TRUE),
         TT_Hr_SSHG_q5 = quantile(x = TravelTime_Hr, probs = 0.05, na.rm = TRUE),
         TT_Hr_SSHG_q95 = quantile(x = TravelTime_Hr, probs = 0.95, na.rm = TRUE)
        ) %>% 
  data.table()


Stats_StSt_HrGrp <- group_by(Quantiles_SSHG_dt,
                             StartStop_ID,
                             Event_Time_HrGroup
                            ) %>% 
  mutate(TD_Mi_SSHG_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SSHG_Mean_F = mean(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95],
                                  na.rm = TRUE
                                 ),
         TD_Mi_SSHG_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SSHG_Med_F = median(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TD_Mi_SSHG_Cnt = sum(!is.na(TravelDistance_Mi)
                             ),
         TD_Mi_SSHG_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95]
                                      )
                               ),
            
         TT_Sec_SSHG_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SSHG_Mean_F = mean(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TT_Sec_SSHG_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SSHG_Med_F = median(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95],
                                    na.rm = TRUE
                                   ),
         TT_Sec_SSHG_Cnt = sum(!is.na(TravelTime_Sec)),
         TT_Sec_SSHG_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95]
                                       )
                                ),

         TT_Hr_SSHG_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SSHG_Mean_F = mean(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95],
                                  na.rm = TRUE
                                 ),
         TT_Hr_SSHG_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SSHG_Med_F = median(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TT_Hr_SSHG_Cnt = sum(!is.na(TravelTime_Hr)),
         TT_Hr_SSHG_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95]
                                      )
                               )
        ) %>% 
  data.frame()

rm(Stats_StSt)
rm(Quantiles_SSHG_dt)
str(Stats_StSt_HrGrp)
# View(head(Stats_StSt_HrGrp, 50))

Feature engineering.

Creating a BusEventRoute row number, and a RouteAlt_Lag1 indicator for future identification purposes.


# rm(Quantiles_dt)
# rm(Quantiles_SS_dt)
# rm(AllDays_BusDay)
# rm(Quantiles_SSHG_dt)
# rm(Stats_StSt)

# AllDays_BusDayRoute <- group_by(Stats_StSt_HrGrp,
#                                 Bus_ID,
#                                 Event_Time_Date,
#                                 Route
#                                ) %>% 
#   mutate(RouteAlt_Lag2 = lag(RouteAlt)  # used in future analyses to identify RouteAlt (direction) changes
#          
#          # Odometer_Distance_Lag1 = lag(Odometer_Distance),
#          # 
#          # # accounting for potential negative distances
#          # TravelDistance_Ft = ifelse(Odometer_Distance >= Odometer_Distance_Lag1,
#          #                            Odometer_Distance - Odometer_Distance_Lag1,
#          #                            NA
#          #                           ),
#          # TravelDistance_Mi = TravelDistance_Ft / 5280, #5,280 feet in 1 mile
#          # 
#          # # accounting for potential negative times
#          # TravelTime_Sec = as.numeric(ifelse(Event_Time >= lag(Departure_Time),
#          #                                    Event_Time - lag(Departure_Time),
#          #                                    NA
#          #                                   )
#          #                            ),
#          # TravelTime_Hr = TravelTime_Sec / 3600, # 3,600 seconds in 1 hour
#          # 
#          # # accounting for potential negative or zero travel times
#          # SpeedAvg_Mph = ifelse(TravelTime_Hr > 0,
#          #                       TravelDistance_Mi / TravelTime_Hr,
#          #                       NA
#          #                      )
#         ) %>% 
#   data.frame()
# 
# rm(Stats_StSt_HrGrp)
# str(AllDays_BusDayRoute)

Feature engineering.

Calculating a variable to know if the RouteAlt changed. Could be useful in helping identifying weirdness in calculated distances and speeds.


# rm(Stats_StSt_HrGrp)

AllDays_DirChange <- Stats_StSt_HrGrp %>%  # AllDays_BusDayRoute %>% 
  mutate(RteChange = ifelse(Route == Route_Lag1,
                            "Same",
                            "Change"
                           ),
         RteChange2 = factor(ifelse(is.na(RteChange),
                                    "Change",
                                    RteChange
                                   )
                            ),
         DirChange = ifelse(RouteAlt == RouteAlt_Lag1,
                            "Same",
                            "Change"
                           ),
         DirChange2 = factor(ifelse(is.na(DirChange),
                                    "Change",
                                    DirChange
                                   )
                            )
        )

# rm(AllDays_BusDayRoute)
rm(Stats_StSt_HrGrp)
str(AllDays_DirChange)

View(filter(AllDays_DirChange,
            between(RowNum_OG, 2570060, 2570080)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )

Re-ordering the variables to ease with comprehension.


AllDays_NewOrder <-  select(AllDays_DirChange,
                            RowNum_OG,
                            UniqueLatLng,
                            group,
                            StartStop_ID,
                            BusDay_EventNum,
                            Bus_ID,
                            Route,
                            RteChange2,
                            RouteAlt,
                            # RouteAlt_Lag1,
                            DirChange2,
                            Route_Direction,
                            Stop_Sequence,
                            Start_ID,
                            Start_Desc,
                            # Stop_ID,
                            StopID_Clean,
                            StopID_Indicator,
                            Stop_Desc,
                            countryCode,
                            Stop_State,
                            Stop_County,
                            Stop_City,
                            Stop_Zip,
                            Event_Type,
                            Event_Description,
                            Event_Time_Yr,
                            Event_Time_Mth,
                            Event_Time_Date,
                            Event_Time_Day,
                            Event_Time_Hr,
                            Event_Time_HrGroup,
                            Event_Time_Min,
                            Event_Time,
                            Departure_Time,
                            Dwell_Time,
                            Dwell_Time2,
                            Delta_Time,
                            Latitude,
                            Longitude,
                            Heading,
                            Odometer_Distance,
                            Odometer_Distance_Lag1,
                            Odometer_Distance_Mi,
                            TravelDistance_Ft,
                            TravelDistance_Mi,
                            TravelDistance_Mi_Hvrs,
                            TD_Mi_q2,
                            TD_Mi_q98,
                            TD_Mi_SS_q5,
                            TD_Mi_SS_q95,
                            TD_Mi_SSHG_q5,
                            TD_Mi_SSHG_q95,
                            TD_Mi_Mean,
                            TD_Mi_Mean_F,
                            TD_Mi_SS_Mean,
                            TD_Mi_SS_Mean_F,
                            TD_Mi_SSHG_Mean,
                            TD_Mi_SSHG_Mean_F,
                            TD_Mi_Med,
                            TD_Mi_Med_F,
                            TD_Mi_SS_Med,
                            TD_Mi_SS_Med_F,
                            TD_Mi_SSHG_Med,
                            TD_Mi_SSHG_Med_F,
                            TD_Mi_Cnt,
                            TD_Mi_Cnt_F,
                            TD_Mi_SS_Cnt,
                            TD_Mi_SS_Cnt_F,
                            TD_Mi_SSHG_Cnt,
                            TD_Mi_SSHG_Cnt_F,
                            TravelTime_Sec,
                            TT_Sec_q2,
                            TT_Sec_q98,
                            TT_Sec_SS_q5,
                            TT_Sec_SS_q95,
                            TT_Sec_SSHG_q5,
                            TT_Sec_SSHG_q95,
                            TT_Sec_Mean,
                            TT_Sec_Mean_F,
                            TT_Sec_SS_Mean,
                            TT_Sec_SS_Mean_F,
                            TT_Sec_SSHG_Mean,
                            TT_Sec_SSHG_Mean_F,
                            TT_Sec_Med,
                            TT_Sec_Med_F,
                            TT_Sec_SS_Med,
                            TT_Sec_SS_Med_F,
                            TT_Sec_SSHG_Med,
                            TT_Sec_SSHG_Med_F,
                            TT_Sec_Cnt,
                            TT_Sec_Cnt_F,
                            TT_Sec_SS_Cnt,
                            TT_Sec_SS_Cnt_F,
                            TT_Sec_SSHG_Cnt,
                            TT_Sec_SSHG_Cnt_F,
                            TravelTime_Hr,
                            TT_Hr_q2,
                            TT_Hr_q98,
                            TT_Hr_SS_q5,
                            TT_Hr_SS_q95,
                            TT_Hr_SSHG_q5,
                            TT_Hr_SSHG_q95,
                            TT_Hr_Mean,
                            TT_Hr_Mean_F,
                            TT_Hr_SS_Mean,
                            TT_Hr_SS_Mean_F,
                            TT_Hr_SSHG_Mean,
                            TT_Hr_SSHG_Mean_F,
                            TT_Hr_Med,
                            TT_Hr_Med_F,
                            TT_Hr_SS_Med,
                            TT_Hr_SS_Med_F,
                            TT_Hr_SSHG_Med,
                            TT_Hr_SSHG_Med_F,
                            TT_Hr_Cnt,
                            TT_Hr_Cnt_F,
                            TT_Hr_SS_Cnt,
                            TT_Hr_SS_Cnt_F,
                            TT_Hr_SSHG_Cnt,
                            TT_Hr_SSHG_Cnt_F,
                            SpeedAvg_Mph
                           )

rm(AllDays_DirChange)
str(select(AllDays_NewOrder,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(AllDays_NewOrder)

# View(head(AllDays_NewOrder, 500))
# View(tail(AllDays_NewOrder, 500))

Summarizing the data to help spot anomolies.


View(group_by(AllDays_NewOrder,
              Stop_City) %>% 
       summarise(Cnt_Num = n(),
                 Cnt_Pct = 100*Cnt_Num / (nrow(AllDays_NewOrder)
                                         )
                ) %>% 
       arrange(desc(Cnt_Num))
)

summary(AllDays_NewOrder)

Investigation of TravelDistance_Mi.

View(TravDistMi_Pctiles): 99% of TravelDistance_Mi are about 1 mile or less…but some weird TravelDistance_Mi values (e.g., 584 miles traveled) exist.


TravDistMi_Ntile <- as.data.frame(AllDays_NewOrder$TravelDistance_Mi) %>% 
  mutate(#Pctile = ntile(AllDays_NewOrder$TravelDistance_Mi, 100),
         #MinR = min_rank(AllDays_NewOrder$TravelDistance_Mi),
         PctR = percent_rank(AllDays_NewOrder$TravelDistance_Mi),
         PctR_Round = round(PctR, 2)
        ) 

colnames(TravDistMi_Ntile)[1] <- "TravelDistance_Mi"
# str(TravDistMi_Ntile)

TravDistMi_Ntile_Rows <- nrow(TravDistMi_Ntile)

# View(tail(TravDistMi_Ntile, 500))


TravDistMi_Pctiles <- group_by(TravDistMi_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinTravDistMiAtPctile = min(TravelDistance_Mi),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravDistMi_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

rm(TravDistMi_Ntile)
rm(TravDistMi_Ntile_Rows)

View(TravDistMi_Pctiles)
TravDistMi_Pctiles

Investigation of TravelDistance_Mi.

Why are some TravelDistance_Mi “NA”? It looks like partially because the records are the first trip of the day (for that bus), so I purposefully set the distance to “NA”. Another reason is due to the odometer recording a value less than the previous odometer recording. In most cases, I have no explanation for this - though I have observed about 67% of all instances where TravelDistance_Mi is NA (other than because it’s the first record of the day) are instances where DirChange2 is “Change”. This is weird and should be asked to WMATA.


# View(head(AllDays_NewOrder, 500))

View(filter(AllDays_NewOrder,
            BusDay_EventNum != 1 # When BusDay_EventNum == 1, TravelDistance_Mi is NA by design (don't want to calculate distance based on yesterday's position)
           ) %>% 
       group_by(StartStop_ID) %>% 
       summarise(Cnts = sum(is.na(TravelDistance_Mi)
                           )
                ) %>% 
       arrange(desc(Cnts)
              )
    )

View(filter(AllDays_NewOrder,
            StartStop_ID == "1000245--1000211"
           ) %>% 
       select(RowNum_OG,
              StartStop_ID,
              Event_Time,
              Event_Time_HrGroup,
              Bus_ID,
              TravelDistance_Mi,
              TravelDistance_Mi_Hvrs,
              TD_Mi_SS_Mean,
              TD_Mi_SS_Mean_F,
              TD_Mi_SSHG_Mean,
              TD_Mi_SSHG_Mean_F,
              TD_Mi_SS_Med,
              TD_Mi_SS_Med_F,
              TD_Mi_SSHG_Med,
              TD_Mi_SSHG_Med_F,
              TD_Mi_SS_Cnt,
              TD_Mi_SS_Cnt_F,
              TD_Mi_SSHG_Cnt,
              TD_Mi_SSHG_Cnt_F
              ) %>% 
       mutate(Ratio_MeanToHvrs = TD_Mi_SS_Mean / TravelDistance_Mi_Hvrs) %>% 
       arrange(Event_Time)
    )

View(filter(AllDays_NewOrder,
            is.na(TravelDistance_Mi)
           )
    )

# These records are NA becuase the record is the first record of the day (the Event_Time_Date)
View(filter(AllDays_NewOrder,
            between(RowNum_OG, 326, 346) | # 336
              between(RowNum_OG, 591, 611) | # 601
              between(RowNum_OG, 845, 865) # 855
           )
    )

Investigation of TravelDistance_Mi.

These records are NA becuase the current record odometer is less than the previous record odometer. Theoretically, this should NOT happen. Me: it appears that about 67% of all instances where TravelDistance_Mi is NA (other than because it’s th first record of the day) are instances where DirChange2 is “Change”. This is weird and should be asked to WMATA.


View(filter(AllDays_NewOrder,
            between(RowNum_OG, 194, 214) | # 204
              between(RowNum_OG, 440, 460) | # 450
              between(RowNum_OG, 478, 498) | # 488
              between(RowNum_OG, 510, 530) # 520
           )
    )

TestTable <- filter(AllDays_NewOrder,
                    BusDay_EventNum != 1
                   ) %>% 
  mutate(TravelDistance_NA = as.factor(ifelse(is.na(TravelDistance_Mi),
                                              "True",
                                              "False"
                                             )
                                      )
        ) %>%
  group_by(DirChange2, TravelDistance_NA) %>%
  summarise(TravDistMi_NACnts = n()
           )

# TestTable

TestTable_Spread <- as.data.frame(spread(TestTable,
                                         TravelDistance_NA,
                                         TravDistMi_NACnts
                                        )
                                 ) %>% 
  select(False,
         True
        )

row.names(TestTable_Spread) <- c("Change", "Same")
# str(TestTable_Spread)
# TestTable_Spread

prop.table(as.table(as.matrix(TestTable_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(TestTable_Spread)
                   ),
           2
          )

Investigation of TravelDistance_Mi.

Let’s look at just the TravelDistance_Mi values that are NOT “NA”.


rm(TestTable, TestTable_Spread)

TravelDistance_Mi_NoNA <- filter(AllDays_NewOrder,
                                 # TravelDistance_Mi != 0 &
                                 !is.na(TravelDistance_Mi)
                                )

dim(AllDays_NewOrder)
dim(TravelDistance_Mi_NoNA)
nrow(AllDays_NewOrder) - nrow(TravelDistance_Mi_NoNA)

str(TravelDistance_Mi_NoNA)
summary(TravelDistance_Mi_NoNA)

Investigation of TravelDistance_Mi.

Let’s plot just the TravelDistance_Mi values that are NOT “NA”.


TravDistMi_HistDen <- ggplot(select(TravelDistance_Mi_NoNA,
                                    TravelDistance_Mi
                                   ),
                             aes(x = TravelDistance_Mi,
                                 y = ..density..
                                )
                            ) +
  geom_histogram(binwidth = 0.05, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 4.0)
                 ) +
  labs(title = "Variation in Distance Between Stops",
       x = "Travel Distance (miles)",
       y = "Density"
      )

TravDistMi_HistDen

Investigation of TravelDistance_Mi.

Looking at the extremely large TravelDistance_Mi values. Some (aprox 27%) of TravelDistance_Mi values > 1 mile are when the DirChange2 changes…but what about the other ~73%?


rm(TravelDistance_Mi_NoNA)

# examples of weirdly large TravelDistance_Mi
View(filter(AllDays_NewOrder,
            TravelDistance_Mi > 1.1587121212 # 1.1587121212 is the 99th percentile
           ) %>% 
       arrange(desc(TravelDistance_Mi)
              )
    )


# Why are these extremes?  Airports?  Bus collection points?
View(filter(AllDays_NewOrder,
              between(RowNum_OG, 494044, 494064) | # 494054
              between(RowNum_OG, 494273, 494293) | # 494283
              between(RowNum_OG, 494626, 494646) | # 494636
              between(RowNum_OG, 1610156, 1610176) | # 1610166
              between(RowNum_OG, 2073074, 2073094) # 2073084
           )
    )

# Before Removing Runs
# View(filter(AllDays_Sorted,
#             between(RowNum_OG, 494044, 494064) | # 494054
#               between(RowNum_OG, 494273, 494293) | # 494283
#               between(RowNum_OG, 494626, 494646) | # 494636
#               between(RowNum_OG, 1610156, 1610176) | # 1610166
#               between(RowNum_OG, 2073074, 2073094) # 2073084
#            )
#     )

# After Removing Runs
# View(filter(AllDays_FirstStopID,
#             between(RowNum_OG, 494044, 494064) | # 494054
#               between(RowNum_OG, 494273, 494293) | # 494283
#               between(RowNum_OG, 494626, 494646) | # 494636
#               between(RowNum_OG, 1610156, 1610176) | # 1610166
#               between(RowNum_OG, 2073074, 2073094) # 2073084
#            )
#     )

Investigation of TravelDistance_Mi.

Any relation with DirChange2? Doesn’t look as if this is so.


ExtremeTravDist <- filter(AllDays_NewOrder,
                          !is.na(TravelDistance_Mi)
                         ) %>% 
  mutate(TravDist_Extreme = ifelse(TravelDistance_Mi > 1.1587121212, # 1.1587121212 is the 99th percentile
                                   "True",
                                   "False"
                                  )
                          ) %>% 
  group_by(DirChange2, TravDist_Extreme) %>% 
  summarise(TravDistMI_ExtCnts = n()
           )

# ExtremeTravDist


ExtremeTravDist_Spread <- as.data.frame(spread(ExtremeTravDist,
                                               TravDist_Extreme,
                                               TravDistMI_ExtCnts
                                              )
                                       ) %>% 
  select(False,
         True
        )

row.names(ExtremeTravDist_Spread) <- c("Change", "Same")
# str(ExtremeTravDist_Spread)
# ExtremeTravDist_Spread

prop.table(as.table(as.matrix(ExtremeTravDist_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(ExtremeTravDist_Spread)
                   ),
           2
          )

Investigation of TravelDistance_Mi.

Looking at specific buses and StartStop_ID.


rm(ExtremeTravDist, ExtremeTravDist_Spread)

View(arrange(group_by(AllDays_NewOrder,
                      Bus_ID
                     ) %>% 
               summarise(DistTrav_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
                         DistTrav_Med = median(TravelDistance_Mi, na.rm = TRUE)
                        ),
             desc(DistTrav_Med)
            )
    )


# example of extremely small TravelDistance_Mi values (looks like the odometer wasn't functioning)
View(filter(AllDays_NewOrder,
            Bus_ID == 6111 |
              Bus_ID == 7201 |
              Bus_ID == 8058
           ) %>% 
       arrange(Bus_ID, Event_Time)
    )


View(arrange(group_by(AllDays_NewOrder,
                      StartStop_ID
                     ) %>% 
               summarise(DistTrav_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
                         DistTrav_Med = median(TravelDistance_Mi, na.rm = TRUE)
                        ),
             desc(DistTrav_Med)
            )
    )

# example of extremely large TravelDistance_Mi values...no idea why...
View(filter(AllDays_NewOrder,
            StartStop_ID == "1003665--12" |
              StartStop_ID == "1003665--5001925" |
              StartStop_ID == "3001038--3002565"
           ) %>% 
       arrange(StartStop_ID, Event_Time)
    )

Investigation of TravelDistance_Mi & TravelDistance_Mi_New.

If TravelDisntace_Mi is below the 5th percentile for that StartStop_ID, or if TravelDisntace_Mi is above the 95th percentile for that StartStop_ID, or if TravelDistance_Mi is NA (when the BusDay_EventNum !=1), consider this an outlier. In this case, replace the value with the mean for that StartStop_ID and HourGroup (TD_Mi_SSHG_Mean_F), or if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.


# View(tail(AllDays_NewOrder, 500))

AllDays_NewTravelDist <- 
  mutate(AllDays_NewOrder,
         TravelDistance_Mi_New = ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SSHG_Cnt_F >= 20,
                                        TD_Mi_SSHG_Mean_F,
                                 ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SSHG_Cnt_F < 20 &
                                          TD_Mi_SS_Cnt_F >= 20,
                                        TD_Mi_SS_Mean_F,
                                 ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SS_Cnt_F < 20 &
                                          TD_Mi_SS_Cnt >= 20,
                                        TD_Mi_SS_Mean,
                                 ifelse(is.na(TravelDistance_Mi) &
                                          BusDay_EventNum != 1 &
                                          TravelDistance_Mi_Hvrs != 0,
                                        TravelDistance_Mi_Hvrs,
                                 ifelse(is.na(TravelDistance_Mi) &
                                          BusDay_EventNum != 1 &
                                          TravelDistance_Mi_Hvrs == 0,
                                        TD_Mi_SS_Mean,
                                        TravelDistance_Mi
                                       ))))),
         TravelDistance_Mi_New_Label = 
           factor(ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SSHG_Cnt_F >= 20,
                         "TD_Mi_SSHG_Mean_F",
                  ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SSHG_Cnt_F < 20 &
                           TD_Mi_SS_Cnt_F >= 20,
                         "TD_Mi_SS_Mean_F",
                  ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SS_Cnt_F < 20 &
                           TD_Mi_SS_Cnt >= 20,
                         "TD_Mi_SS_Mean",
                  ifelse(is.na(TravelDistance_Mi) &
                           BusDay_EventNum != 1 &
                           TravelDistance_Mi_Hvrs != 0,
                         "TravelDistance_Mi_Hvrs",
                  ifelse(is.na(TravelDistance_Mi) &
                           BusDay_EventNum != 1 &
                           TravelDistance_Mi_Hvrs == 0,
                         "TD_Mi_SS_Mean",
                         "TravelDistance_Mi"
                        )))))
                 ),
         TravelDistance_Mi_NewHvrs = ifelse(!is.na(TravelDistance_Mi_Hvrs) &
                                              TravelDistance_Mi_Hvrs != 0 &
                                              (TravelDistance_Mi_New < TD_Mi_q2 |
                                                 TravelDistance_Mi_New > TD_Mi_q98
                                              ),
                                            TravelDistance_Mi_Hvrs,
                                            TravelDistance_Mi_New
                                           ),
         TravelDistance_Mi_NewHvrs_Label =
           factor(ifelse(!is.na(TravelDistance_Mi_Hvrs) &
                           TravelDistance_Mi_Hvrs != 0 &
                           (TravelDistance_Mi_New < TD_Mi_q2 |
                              TravelDistance_Mi_New > TD_Mi_q98
                           ),
                         "TravelDistance_Mi_Hvrs",
                         as.character(TravelDistance_Mi_New_Label)
                        )
                 ),
         SpeedAvg_Mph_NewHvrs = TravelDistance_Mi_NewHvrs / TravelTime_Hr
        )

str(AllDays_NewTravelDist)

Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Quick summary and then correlation calculation.


rm(AllDays_NewOrder)


# 38 rows meet this criteria anymore  --  appears to be the case when both the Lat Long calculations, and the TravelDistance calculations did not function properly.
View(filter(AllDays_NewTravelDist,
            is.na(TravelDistance_Mi_New) &
              BusDay_EventNum != 1
           )
    )

View(AllDays_NewTravelDist %>% 
       arrange(desc(TravelDistance_Mi_New)) %>% 
       head(500)
    )

summary(select(AllDays_NewTravelDist,
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      BusDay_EventNum != 1
                     ),
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )


cor(select(AllDays_NewTravelDist,
           TravelDistance_Mi,
           TravelDistance_Mi_Hvrs,
           TravelDistance_Mi_New,
           TravelDistance_Mi_NewHvrs
          ),
    use = "pairwise.complete.obs"
  )

Investigation of TravelDistance_Mi_NewHvrs_Label & TravelDistance_Mi_NewHvrs_Label.

Show how the labels changed.


group_by(AllDays_NewTravelDist,
         TravelDistance_Mi_New_Label,
         TravelDistance_Mi_NewHvrs_Label
        ) %>% 
  summarise(CntNum = n(),
            CntPct = format(CntNum / nrow(AllDays_NewTravelDist),
                            scientific = 9999
                           )
           ) %>% 
  arrange(desc(CntPct)
         )

Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Graphing the two methods of calculating TravelDistance_Mi.

First, let’s get create a function to plot the liner model equation.


lm_eqn <- function(df, y, x){
  m <- lm(y ~ x, df)
  
  l <- list(a = format(coef(m)[1], digits = 2),
            b = format(abs(coef(m)[2]), digits = 2),
            s1 = ifelse(test = coef(m)[2] > 0,
                        yes = "+",
                        no = "-"
                       ),
            r2 = format(summary(m)$r.squared,
                        digits = 3
                       )
           )
  
  eq <- substitute(italic(y) == a~~s1~~b %.% italic(x)*","~~italic(r)^2~"="~r2,
                   l
                  )
  
  as.character(as.expression(eq)
              )             
}

Investigation of TravelDistance_Mi & TravelDistance_Mi_NewHvrs.

Scatter plot (using a 10% sample to making plotting time faster and to reduce un-needed data in the “same” splot).


set.seed(123456789)
AllDays_NewTravelDist_10Pct <- filter(AllDays_NewTravelDist,
                                      !is.na(TravelDistance_Mi_NewHvrs) &
                                        !is.na(TravelDistance_Mi)
                                     ) %>% 
  rename(DistMethod = TravelDistance_Mi_NewHvrs_Label) %>% 
  sample_frac(0.1)


TravDist_MiVsCalc <- ggplot(select(AllDays_NewTravelDist_10Pct,
                                   TravelDistance_Mi_NewHvrs,
                                   TravelDistance_Mi,
                                   DistMethod
                                  ),
                            aes(x = TravelDistance_Mi,
                                y = TravelDistance_Mi_NewHvrs,
                                colour = DistMethod
                               )
                           ) +
  scale_colour_manual(values = c("red","blue", "green", "orange", "black")
                     ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 1.5)
                 ) +
  scale_x_continuous(breaks = seq(0, 1.5, 0.25)
                    ) +
  scale_y_continuous(breaks = seq(0, 1.5, 0.25)
                    ) +
  theme(legend.position = "bottom", #c(0.85, 0.40),
        legend.text = element_text(size = 6)
       ) +
  annotate(label = lm_eqn(df = AllDays_NewTravelDist_10Pct,
                          x = AllDays_NewTravelDist_10Pct$TravelDistance_Mi,
                          y = AllDays_NewTravelDist_10Pct$TravelDistance_Mi_NewHvrs
                         ),
           # x = 62,
           # y = 20,
           x = 0.70,
           y = 0.00,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  annotate(label = "Reference Line (slope = 1)",
           # x = 16,
           # y = 30,
           x = 0.80,
           y = 1.05,
           geom = "text",
           size = 3,
           colour = "red"
          ) +
  labs(title = "TravelDistance_Mi vs. TravelDistance_Mi_NewHvrs",
       x = "TravelDistance_Mi",
       y = "TravelDistance_Mi_NewHvrs"
      )
# +
#   geom_jitter()

TravDist_MiVsCalc

Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Graphing test with rbokeh.


TravDist_MiVsCalc_Bokeh <- figure(data = select(AllDays_NewTravelDist_10Pct,
                                                TravelDistance_Mi_NewHvrs,
                                                TravelDistance_Mi,
                                                DistMethod
                                               ),
                                  xlim = c(0, 1.5),
                                  ylim = c(0, 1.5),
                                  legend_location = "bottom_right"
                                 ) %>% 
  ly_points(x = TravelDistance_Mi,
            y = TravelDistance_Mi_NewHvrs,
            color = DistMethod,
            hover = c(TravelDistance_Mi_NewHvrs, TravelDistance_Mi, DistMethod)
           ) %>% 
  ly_abline(a = 0, b = 1, color = "red")

TravDist_MiVsCalc_Bokeh

Investigation of TravelDistance_Mi_New.

Calculating the minimum TravelDistance_Mi_New value at each percentile.


rm(TravDist_MiVsCalc_Bokeh)
rm(AllDays_NewTravelDist_10Pct)


summary(select(AllDays_NewTravelDist,
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      BusDay_EventNum != 1
                     ),
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )


TravDistMiN_Ntile <- as.data.frame(select(AllDays_NewTravelDist,
                                          StartStop_ID,
                                          TravelDistance_Mi_New_Label,
                                          # TravelDistance_Mi_NewHvrs_Label,
                                          TravelDistance_Mi_New
                                          # TravelDistance_Mi_NewHvrs
                                         )
                                  ) %>% 
  mutate(PctR_N = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_New),
         # PctR_H = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_NewHvrs),
         PctR_Round_N = round(PctR_N, 2)
         # PctR_Round_H = round(PctR_H, 2)
        ) 

# str(TravDistMiN_Ntile)
# View(head(TravDistMiN_Ntile, 500))

TravDistMiN_Ntile_Rows <- nrow(TravDistMiN_Ntile)

# View(tail(TravDistMiN_Ntile, 500))


TravDistMiN_Pctiles <- group_by(TravDistMiN_Ntile,
                                PctR_Round_N
                               ) %>% 
  summarise(
    MinTDMiAtPctile_N = min(TravelDistance_Mi_New),
    # MinTDMiAtPctile_H = min(TravelDistance_Mi_NewHvrs),
    CntsAtPctile_N = sum(!is.na(TravelDistance_Mi_New)),
    # CntsAtPctile_H = sum(!is.na(TravelDistance_Mi_NewHvrs)),
    PctsAtPctile_N = CntsAtPctile_N / TravDistMiN_Ntile_Rows
    # PctsAtPctile_H = CntsAtPctile_H / TravDistMiN_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP_N = cumsum(PctsAtPctile_N)
         # CumSumPAtP_H = cumsum(PctsAtPctile_H)
        )

# View(TravDistMiN_Pctiles)

Investigation of TravelDistance_Mi_NewHvrs

Calculating the minimum TravelDistance_Mi_NewHvrs value at each percentile.


TravDistMiH_Ntile <- as.data.frame(select(AllDays_NewTravelDist,
                                          StartStop_ID,
                                          # TravelDistance_Mi_New_Label,
                                          TravelDistance_Mi_NewHvrs_Label,
                                          # TravelDistance_Mi_New,
                                          TravelDistance_Mi_NewHvrs
                                         )
                                  ) %>% 
  mutate(# PctR_N = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_New),
         PctR_H = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_NewHvrs),
         # PctR_Round_N = round(PctR_N, 2),
         PctR_Round_H = round(PctR_H, 2)
        ) 

# str(TravDistMiH_Ntile)
# View(head(TravDistMiH_Ntile, 500))

TravDistMiH_Ntile_Rows <- nrow(TravDistMiH_Ntile)

# View(tail(TravDistMiH_Ntile, 500))


TravDistMiH_Pctiles <- group_by(TravDistMiH_Ntile,
                                PctR_Round_H
                               ) %>% 
  summarise(
    # MinTDMiAtPctile_N = min(TravelDistance_Mi_New),
    MinTDMiAtPctile_H = min(TravelDistance_Mi_NewHvrs),
    # CntsAtPctile_N = sum(!is.na(TravelDistance_Mi_New)),
    CntsAtPctile_H = sum(!is.na(TravelDistance_Mi_NewHvrs)),
    # PctsAtPctile_N = CntsAtPctile_N / TravDistMiH_Ntile_Rows,
    PctsAtPctile_H = CntsAtPctile_H / TravDistMiH_Ntile_Rows
  ) %>% 
  mutate(# CumSumPAtP_N = cumsum(PctsAtPctile_N),
         CumSumPAtP_H = cumsum(PctsAtPctile_H)
        )

# View(TravDistMiH_Pctiles)

Join TravDistMiH_Pctiles, TravDistMiN_Pctiles, and TravDistMi_Pctiles.

~11% of rides are still showing as less than 0.1 miles of TravelDistance_Mi_NewHvrs.


rm(TravDistMiN_Ntile_Rows, TravDistMiH_Ntile_Rows, TravDistMiN_Ntile, TravDistMiH_Ntile)


# View(TravDistMi_Pctiles)
# View(TravDistMiN_Pctiles)
# View(TravDistMiH_Pctiles)

TravDistMi_Pctiles_All <- inner_join(x = TravDistMi_Pctiles,
                                     y = TravDistMiN_Pctiles,
                                     by = c("PctR_Round" = "PctR_Round_N")
                                    ) %>% 
  inner_join(y = TravDistMiH_Pctiles,
             by = c("PctR_Round" = "PctR_Round_H")
            ) %>% 
  select(PctR_Round,
         MinTravDistMiAtPctile,
         MinTDMiAtPctile_N,
         MinTDMiAtPctile_H,
         CntsAtPctile,
         CntsAtPctile_N,
         CntsAtPctile_H,
         PctsAtPctile,
         PctsAtPctile_N,
         PctsAtPctile_H,
         CumSumPAtP,
         CumSumPAtP_N,
         CumSumPAtP_H
         )

# str(TravDistMi_Pctiles_All)

rm(TravDistMi_Pctiles, TravDistMiN_Pctiles,TravDistMiH_Pctiles)


View(TravDistMi_Pctiles_All)
TravDistMi_Pctiles_All

Investigation of TravelDistance_Mi_New.

Why are there still some small or large TravelDistance_Mi_NewHvrs values.


# View(filter(AllDays_NewTravelDist,
#             !is.na(TravelDistance_Mi_NewHvrs)
#            ) %>% 
#        select(-TD_Mi_q2,
#               -TD_Mi_q98,
#               -TD_Mi_SS_q5,
#               -TD_Mi_SS_q95,
#               -TD_Mi_SSHG_q5,
#               -TD_Mi_SSHG_q95,
#               -TD_Mi_Mean,
#               -TD_Mi_Mean_F,
#               -TD_Mi_SS_Mean,
#               -TD_Mi_SS_Mean_F,
#               -TD_Mi_SSHG_Mean,
#               -TD_Mi_SSHG_Mean_F,
#               -TD_Mi_Med,
#               -TD_Mi_Med_F,
#               -TD_Mi_SS_Med,
#               -TD_Mi_SS_Med_F,
#               -TD_Mi_SSHG_Med,
#               -TD_Mi_SSHG_Med_F,
#               -TD_Mi_Cnt,
#               -TD_Mi_Cnt_F,
#               -TD_Mi_SS_Cnt,
#               -TD_Mi_SS_Cnt_F,
#               -TD_Mi_SSHG_Cnt,
#               -TD_Mi_SSHG_Cnt_F,
#               -TT_Sec_q2,
#               -TT_Sec_q98,
#               -TT_Sec_SS_q5,
#               -TT_Sec_SS_q95,
#               -TT_Sec_SSHG_q5,
#               -TT_Sec_SSHG_q95,
#               -TT_Sec_Mean,
#               -TT_Sec_Mean_F,
#               -TT_Sec_SS_Mean,
#               -TT_Sec_SS_Mean_F,
#               -TT_Sec_SSHG_Mean,
#               -TT_Sec_SSHG_Mean_F,
#               -TT_Sec_Med,
#               -TT_Sec_Med_F,
#               -TT_Sec_SS_Med,
#               -TT_Sec_SS_Med_F,
#               -TT_Sec_SSHG_Med,
#               -TT_Sec_SSHG_Med_F,
#               -TT_Sec_Cnt,
#               -TT_Sec_Cnt_F,
#               -TT_Sec_SS_Cnt,
#               -TT_Sec_SS_Cnt_F,
#               -TT_Sec_SSHG_Cnt,
#               -TT_Sec_SSHG_Cnt_F,
#               -TT_Hr_q2,
#               -TT_Hr_q98,
#               -TT_Hr_SS_q5,
#               -TT_Hr_SS_q95,
#               -TT_Hr_SSHG_q5,
#               -TT_Hr_SSHG_q95,
#               -TT_Hr_Mean,
#               -TT_Hr_Mean_F,
#               -TT_Hr_SS_Mean,
#               -TT_Hr_SS_Mean_F,
#               -TT_Hr_SSHG_Mean,
#               -TT_Hr_SSHG_Mean_F,
#               -TT_Hr_Med,
#               -TT_Hr_Med_F,
#               -TT_Hr_SS_Med,
#               -TT_Hr_SS_Med_F,
#               -TT_Hr_SSHG_Med,
#               -TT_Hr_SSHG_Med_F,
#               -TT_Hr_Cnt,
#               -TT_Hr_Cnt_F,
#               -TT_Hr_SS_Cnt,
#               -TT_Hr_SS_Cnt_F,
#               -TT_Hr_SSHG_Cnt,
#               -TT_Hr_SSHG_Cnt_F
#              ) %>% 
#        arrange(TravelDistance_Mi_NewHvrs) %>% 
#        head(500)
#     )

View(filter(AllDays_NewTravelDist,
            !is.na(TravelDistance_Mi_NewHvrs)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             ) %>% 
       arrange(TravelDistance_Mi_NewHvrs) %>%
       head(500)
    )

# examples of the smallest TravelDistance_Mi_NewHvrs values.
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1424440 & RowNum_OG <= 1424460) | # 1424450  --  direction change
                (RowNum_OG >= 763292 & RowNum_OG <= 763312) | # 763302  --  direction change
                (RowNum_OG >= 1679093 & RowNum_OG <= 1679113) | # 1679103  --  direction change
                (RowNum_OG >= 2860918 & RowNum_OG <= 2860938) # 2860928  --  looks correct
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )


View(filter(AllDays_NewTravelDist,
            !is.na(TravelDistance_Mi_NewHvrs)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             ) %>% 
       arrange(desc(TravelDistance_Mi_NewHvrs)
              ) %>%
       head(500)
    )

# examples of the largest TravelDistance_Mi_NewHvrs values.
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1092000 & RowNum_OG <= 1092050) | # 1092030  --  direction change
                (RowNum_OG >= 1609460 & RowNum_OG <= 1609480) | # 1609470  -- direction change 
                (RowNum_OG >= 508904 & RowNum_OG <= 508924) | # 508914  --  direction change & original StopID was bad
                (RowNum_OG >= 2476345 & RowNum_OG <= 2476365) # 2476355  --  direction change
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )

Investigation of TravelTime_Hr.

View(TravDistMi_Pctiles): 98% of TravelTime_Hr are between 7 seconds and 464 seconds (~8 minutes).


TravTimeHr_Ntile <- select(AllDays_NewTravelDist,
                           TravelTime_Hr
                          ) %>% 
  mutate(# Pctile = ntile(AllDays_NewTravelDist$TravelTime_Hr, 100),
         # MinR = min_rank(AllDays_NewTravelDist$TravelTime_Hr),
         PctR = percent_rank(AllDays_NewTravelDist$TravelTime_Hr),
         PctR_Round = round(PctR, 2)
        ) 

# str(TravTimeHr_Ntile)

TravTimeHr_Ntile_Rows <- nrow(TravTimeHr_Ntile)

# View(tail(TravTimeHr_Ntile, 500))


TravTimeHr_Pctiles <- group_by(TravTimeHr_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinTravTimeHrAtPctile = min(TravelTime_Hr),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravTimeHr_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile),
         MinTravTimeSecAtPctile = MinTravTimeHrAtPctile * 3600
        )

rm(TravTimeHr_Ntile_Rows)
rm(TravTimeHr_Ntile)
View(TravTimeHr_Pctiles)
TravTimeHr_Pctiles

Investigation of TravelTime_Hr.

Histogram of TravelTime_Sec.


TravTime_Sec_HistDen <- ggplot(filter(select(AllDays_NewTravelDist,
                                             TravelTime_Sec
                                            ),
                                      !is.na(TravelTime_Sec)
                                     ),
                               aes(x = TravelTime_Sec,
                                   y = ..density..
                                  )
                          ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  # stat_bin(binwidth = 5,
  #          geom = "text",
  #          size = 2.5,
  #          vjust = 1.5,
  #          aes(label = format(..count.., big.mark = ",")
  #             ),
  #         ) +
  coord_cartesian(xlim = c(0, 180), ylim = c(0, 0.02)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Time",
       x = "Travel Time (sec)",
       y = "Density"
      )

TravTime_Sec_HistDen

Investigation of TravelTime_Sec.

TravelTime_Sec values are NA.


summary(AllDays_NewTravelDist$TravelTime_Sec)


View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(is.na(TravelTime_Sec) &
                BusDay_EventNum != 1  # TravelTime purposefully not calculated here
             )
    )

# examples of TravelTime_Sec values that are NA. These are NA because the Event_Time & Departure_Time readings are not accurate (i.e., the previous Departure_Time is BEFORE or EQUAL TO the current Event_Time).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 90809 & RowNum_OG <= 90829) | # 90819
                (RowNum_OG >= 90881 & RowNum_OG <= 90901) | # 90891
                (RowNum_OG >= 2597066 & RowNum_OG <= 2597086) | # 2597076
                (RowNum_OG >= 2613305 & RowNum_OG <= 2613325) # 2613315
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt"))
    )

Investigation of TravelTime_Sec.

TravelTime_Sec values are extremely small.


View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(!is.na(TravelTime_Sec)
             ) %>% 
       arrange(TravelTime_Sec,
               desc(SpeedAvg_Mph_NewHvrs)
              ) %>%
       head(500)
    )

# examples where TravelTime_Sec is small (1 sec) and SpeedAvg_Mph_NewHvrs is large.
View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter((RowNum_OG >= 2217353 & RowNum_OG <= 2217373) | # 2217363
                (RowNum_OG >= 3090321 & RowNum_OG <= 3090341) | # 3090331
                (RowNum_OG >= 80764 & RowNum_OG <= 80784) | # 80774
                (RowNum_OG >= 33840 & RowNum_OG <= 33860) # 33850
           )
    )

Investigation of TravelTime_Sec.

TravelTime_Sec values are extremely large.


View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(!is.na(TravelTime_Sec)
             ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )

# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter((RowNum_OG >= 1007703 & RowNum_OG <= 1007723) | # 1007713
                (RowNum_OG >= 2373564 & RowNum_OG <= 2373584) | # 2373574
                (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
                (RowNum_OG >= 2570060 & RowNum_OG <= 2570080) # 2570070
           )
    )

Investigation of TravelTime_Sec.

Are large TravelTime_Sec values related to RouteChanges? Looks likely. When the Bus involves a Route “change”, there is almost twice as likely to be a case of an outlier TravelTime_Sec value (on the high side).


TTLargeRteChng <- select(AllDays_NewTravelDist,
                         -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
                        ) %>% 
  mutate(TT_Out = factor(ifelse(TravelTime_Sec > 464,  # this is the 99th percentile
                                "Outlier",
                                "Normal"
                               )
                        )
        )

# str(TTLargeRteChng)


TTLargeRteChng_Cnts <- group_by(TTLargeRteChng,
                                RteChange2,
                                TT_Out
                               ) %>% 
  summarise(Cnts = n()
           )

TTLargeRteChng_Spread <- as.data.frame(spread(TTLargeRteChng_Cnts,
                                              TT_Out,
                                              Cnts
                                             )
                                      ) %>%
  select(-RteChange2)

row.names(TTLargeRteChng_Spread) <- c("Change", "Same")
# str(TTLargeRteChng_Spread)


# When the Bus involves a Route "change", there is almost twice as likely to be a case of an outlier TravelTime_Sec value.
TTLargeRteChng_Spread
prop.table(as.table(as.matrix(TTLargeRteChng_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(TTLargeRteChng_Spread)
                   ),
           2
          )

# rm(TTLargeRteChng, TTLargeRteChng_Spread)
         

View(filter(TTLargeRteChng,
            !is.na(TravelTime_Sec) &
              RteChange2 == "Same"
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )


# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(filter(TTLargeRteChng,
            (RowNum_OG >= 2250290 & RowNum_OG <= 2250310) | # 2250300
              (RowNum_OG >= 867717 & RowNum_OG <= 867737) | # 867727
              (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
              (RowNum_OG >= 808395 & RowNum_OG <= 808415) # 808405
           )
    )

Investigation of TravelTime_Sec.

If TravelTime_Sec is below the 5th percentile for that StartStop_ID, or if TravelTime_Sec is above the 95th percentile for that StartStop_ID, consider this an outlier. In this case, replace the value with the mean for that StartStop_ID and HourGroup (TT_Sec_SSHG_Mean_F), or if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.


rm(TTLargeRteChng, TTLargeRteChng_Cnts, TTLargeRteChng_Spread)


NewTravTime <- mutate(AllDays_NewTravelDist,
                      TT_Sec_New = ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SSHG_Cnt_F >= 20,
                                          TT_Sec_SSHG_Mean_F,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SSHG_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt_F >= 20,
                                          TT_Sec_SS_Mean_F,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SS_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt >= 20,
                                          TT_Sec_SS_Mean,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SS_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt < 20 &
                                            RteChange2 == "Change",
                                          NA,
                                          TravelTime_Sec
                                         )))),
                      
                      TT_Sec_New_Label = 
           factor(ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SSHG_Cnt_F >= 20,
                         "TT_Sec_SSHG_Mean_F",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SSHG_Cnt_F < 20 &
                           TT_Sec_SS_Cnt_F >= 20,
                         "TT_Sec_SS_Mean_F",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                            ) &
                           TT_Sec_SS_Cnt_F < 20 &
                           TT_Sec_SS_Cnt >= 20,
                         "TT_Sec_SS_Mean",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SS_Cnt_F < 20 &
                           TT_Sec_SS_Cnt < 20 &
                           RteChange2 == "Change",
                         NA,
                         "TravelTime_Sec"
                        ))))
                 ),
                  
                  TT_Hr_New = TT_Sec_New / (60 * 60)
           )


dim(AllDays_NewTravelDist)
dim(NewTravTime)
rm(AllDays_NewTravelDist)

summary(select(NewTravTime,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )

str(select(NewTravTime,
           TravelTime_Sec,
           TT_Sec_New,
           TT_Sec_New_Label,
           TT_Hr_New
          )
   )


summary(select(NewTravTime,
               TravelTime_Sec,
               TT_Sec_New,
               TT_Sec_New_Label,
               TT_Hr_New
              )
       )

Test investigation of just the X2 Route. Box plots for time between bus arrivals (by HourGroup).


View(head(select(NewTravTime,
                 -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
                )
         )
    )

X2 <- select(NewTravTime,
             -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
            ) %>% 
  filter(Route == "X2")

str(X2)

View(head(arrange(X2,
                  Bus_ID,
                  Event_Time
                 ),
          500
         )
    )

X2_ByStop <- group_by(X2,
                      StopID_Clean
                     ) %>% 
  arrange(StopID_Clean,
          Event_Time) %>% 
  mutate(Event_Time_L1 = lag(Event_Time),
         TimeToEvent_Sec = as.numeric(Event_Time - Event_Time_L1),
         TimeToEvent_Min = TimeToEvent_Sec / 60
        )

View(head(X2_ByStop, 500))


# Count_Values is needed to display the medians on the box plots
Count_Values <- ddply(as.data.frame(X2_ByStop),
                      .(Event_Time_HrGroup),
                      summarise,
                      Value_Counts = median(TimeToEvent_Min, na.rm = TRUE)
                     )

TimeBtwEvents_X2_BoxPlot <- ggplot(select(as.data.frame(X2_ByStop),
                                          TimeToEvent_Min,
                                          Event_Time_HrGroup
                                         ),
                                   aes(factor(Event_Time_HrGroup),
                                       TimeToEvent_Min,
                                       fill = factor(Event_Time_HrGroup)
                                      )
                                  ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 120)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Hour Group",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_BoxPlot

Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Hour Group).


TimeBtwEvents_X2_ViolinPlot <- ggplot(select(as.data.frame(X2_ByStop),
                                             TimeToEvent_Min,
                                             Event_Time_HrGroup
                                             ),
                                      aes(factor(Event_Time_HrGroup),
                                          TimeToEvent_Min,
                                          fill = factor(Event_Time_HrGroup)
                                         )
                                     ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 80)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Hour Group",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_ViolinPlot

Test investigation of just the X2 Route. Box plots for time between bus arrivals (by Zip Code).


# Count_Values is needed to display the medians on the box plots
Count_Values_z <- ddply(as.data.frame(X2_ByStop),
                        .(Stop_Zip),
                        summarise,
                        Value_Counts = median(TimeToEvent_Min, na.rm = TRUE)
                       )

TimeBtwEvents_X2_BoxPlot_z <- ggplot(select(as.data.frame(X2_ByStop),
                                            TimeToEvent_Min,
                                            Stop_Zip
                                           ),
                                     aes(factor(Stop_Zip),
                                         TimeToEvent_Min,
                                         fill = factor(Stop_Zip)
                                        )
                                    ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = Count_Values_z,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 100)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Zip Code of Destination",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_BoxPlot_z

Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Zip Code).


TimeBtwEvents_X2_ViolinPlot_z <- ggplot(select(as.data.frame(X2_ByStop),
                                               TimeToEvent_Min,
                                               Stop_Zip
                                               ),
                                        aes(factor(Stop_Zip),
                                            TimeToEvent_Min,
                                            fill = factor(Stop_Zip)
                                           )
                                       ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = Count_Values_z,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 60)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Zip Code of Destination",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_ViolinPlot_z

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

First, get the max and min times of bus stops (each day, and for each route).

RouteMinMax <- group_by(NewTravTime,
                        Route,
                        Event_Time_Date
                       ) %>% 
  summarise(MinTime = min(Event_Time),
            MaxTime = max(Event_Time)
           )
str(RouteMinMax)
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 1329 obs. of  4 variables:
 $ Route          : chr  "10A" "10A" "10A" "10A" ...
 $ Event_Time_Date: int  3 4 5 6 7 3 4 5 6 7 ...
 $ MinTime        : POSIXct, format: "2016-10-03 00:00:19" "2016-10-04 00:00:54" ...
 $ MaxTime        : POSIXct, format: "2016-10-03 23:57:27" "2016-10-04 23:59:23" ...
 - attr(*, "vars")=List of 1
  ..$ : symbol Route
 - attr(*, "drop")= logi TRUE
View(RouteMinMax)

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

(Pulls here are done by day, as the data are too large to do at once.)

# Bind all the individual dataframes together.
WaitData_DayPull <- bind_rows(Testing_3,
                              Testing_4,
                              Testing_5,
                              Testing_6,
                              Testing_7
                             ) %>% 
  mutate(WaitTime_Sec3 = NB - SampTime,
         WaitTime_Min3 = WaitTime_Sec3 / 60
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         )
str(WaitData_DayPull)
'data.frame':   2666526 obs. of  23 variables:
 $ RowNum_OG         : int  771269 510393 842137 416282 403679 478483 842251 403790 842364 403906 ...
 $ Route             : chr  "10A" "10A" "10A" "10A" ...
 $ Event_Time_Date   : int  3 3 3 3 3 3 3 3 3 3 ...
 $ StopID_Clean      : chr  "2" "2" "2" "2" ...
 $ Event_Type        : int  3 4 3 3 4 3 3 4 3 4 ...
 $ Event_Description : Factor w/ 3 levels "Serviced Stop                                     ",..: 1 3 1 1 3 1 1 3 1 3 ...
 $ Event_Time_Yr     : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
 $ Event_Time_Mth    : int  10 10 10 10 10 10 10 10 10 10 ...
 $ Event_Time_Day    : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
 $ Event_Time_Hr     : int  0 1 5 5 6 6 7 8 9 10 ...
 $ Event_Time_HrGroup: Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 1 1 2 2 3 3 3 3 4 4 ...
 $ Event_Time_Min    : int  1 3 5 38 21 40 12 36 21 27 ...
 $ Event_Time        : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
 $ MinTime           : POSIXct, format: "2016-10-03 00:00:19" "2016-10-03 00:00:19" ...
 $ MaxTime           : POSIXct, format: "2016-10-03 23:57:27" "2016-10-03 23:57:27" ...
 $ SampTime          : POSIXct, format: "2016-10-03 15:35:56" "2016-10-03 10:41:52" ...
 $ NB                : POSIXct, format: "2016-10-03 16:01:44" "2016-10-03 10:42:48" ...
 $ WaitTime_Min      : num  25.79 55.82 2.44 28.76 1.48 ...
 $ WaitTime_Sec      : num  1547.1 3349.2 146.6 1725.5 88.9 ...
 $ WaitTime_Sec2     : num  25.79 55.82 2.44 28.76 1.48 ...
 $ WaitTime_Min2     : num  0.4298 0.9303 0.0407 0.4793 0.0247 ...
 $ WaitTime_Sec3     :Class 'difftime'  atomic [1:2666526] 1547.1 55.8 8794.3 1725.5 5333.4 ...
  .. ..- attr(*, "units")= chr "secs"
 $ WaitTime_Min3     :Class 'difftime'  atomic [1:2666526] 25.79 0.93 146.57 28.76 88.89 ...
  .. ..- attr(*, "units")= chr "secs"
View(head(WaitData_DayPull, 500))
View(tail(WaitData_DayPull, 500))

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

Basic investigation of any missing rows from data pulled by day.


DistinctRowNum_OG <- distinct(select(WaitData_DayPull,
                                     RowNum_OG
                                    )
                             )

str(DistinctRowNum_OG)

View(
anti_join(Samp,
          DistinctRowNum_OG,
          by = c("RowNum_OG" = "RowNum_OG")
         )
)


# The samp time is AFTER the last bus passed that StopID_Clean
View(filter(Samp,
            Event_Time > "2016-10-07 19:48:41" &
              Route == "X2" &
              StopID_Clean == 1003774
           )
    )

# Next Bus (NB) can be on the next morning
View(filter(Testing7,
            SampTime > "2016-10-06 23:58:00" &
              SampTime < "2016-10-06 23:59:59")
    )

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

(Pulls here are done by groupings of bus routes, as the data are too large to do at once.)

First, we need to find the most common bus routes.

rm(DistinctRowNum_OG)
object 'DistinctRowNum_OG' not found
# View(head(NewTravTime, 500))
set.seed(123456789)
BusGroups <- group_by(NewTravTime,
                      Route
                     ) %>% 
  summarise(Cnt_Num = n(),
            Cnt_Pct = Cnt_Num / nrow(NewTravTime)
           ) %>% 
  arrange(desc(Cnt_Num)
         ) %>% 
  mutate(RowNum = row_number(),
         RandNum = runif(n = 268),
         RouteGroup = ifelse(RandNum <= 0.2,
                             1,
                      ifelse(RandNum <= 0.4,
                             2,
                      ifelse(RandNum <= 0.6,
                             3,
                      ifelse(RandNum <= 0.8,
                             4,
                             5
                            ))))
        )
str(BusGroups)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   268 obs. of  6 variables:
 $ Route     : chr  "70" "W4" "B2" "S2" ...
 $ Cnt_Num   : int  48269 47672 43173 42934 41462 38968 38566 37761 37718 36524 ...
 $ Cnt_Pct   : num  0.0172 0.017 0.0154 0.0153 0.0148 ...
 $ RowNum    : int  1 2 3 4 5 6 7 8 9 10 ...
 $ RandNum   : num  0.693 0.673 0.654 0.719 0.922 ...
 $ RouteGroup: num  4 4 4 4 5 5 2 5 2 3 ...
View(BusGroups)
summary(BusGroups)
    Route              Cnt_Num         Cnt_Pct              RowNum      
 Length:268         Min.   :    4   Min.   :1.424e-06   Min.   :  1.00  
 Class :character   1st Qu.: 2640   1st Qu.:9.396e-04   1st Qu.: 67.75  
 Mode  :character   Median : 7358   Median :2.619e-03   Median :134.50  
                    Mean   :10483   Mean   :3.731e-03   Mean   :134.50  
                    3rd Qu.:17014   3rd Qu.:6.056e-03   3rd Qu.:201.25  
                    Max.   :48269   Max.   :1.718e-02   Max.   :268.00  
    RandNum           RouteGroup   
 Min.   :0.001084   Min.   :1.000  
 1st Qu.:0.255701   1st Qu.:2.000  
 Median :0.512479   Median :3.000  
 Mean   :0.501473   Mean   :3.022  
 3rd Qu.:0.756575   3rd Qu.:4.000  
 Max.   :0.997351   Max.   :5.000  

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

(Pulls here are done by groupings of bus routes, as the data are too large to do at once.)

str(WaitData_RoutePull)
'data.frame':   2780848 obs. of  22 variables:
 $ RowNum_OG         : int  771269 510393 842137 416282 403679 478483 842251 403790 842364 403906 ...
 $ Route             : chr  "10A" "10A" "10A" "10A" ...
 $ RouteGroup        : num  4 4 4 4 4 4 4 4 4 4 ...
 $ Event_Time_Date   : int  3 3 3 3 3 3 3 3 3 3 ...
 $ StopID_Clean      : chr  "2" "2" "2" "2" ...
 $ Event_Type        : int  3 4 3 3 4 3 3 4 3 4 ...
 $ Event_Description : Factor w/ 3 levels "Serviced Stop                                     ",..: 1 3 1 1 3 1 1 3 1 3 ...
 $ Event_Time_Yr     : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
 $ Event_Time_Mth    : int  10 10 10 10 10 10 10 10 10 10 ...
 $ Event_Time_Day    : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
 $ Event_Time_Hr     : int  0 1 5 5 6 6 7 8 9 10 ...
 $ Event_Time_HrGroup: Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 1 1 2 2 3 3 3 3 4 4 ...
 $ Event_Time_Min    : int  1 3 5 38 21 40 12 36 21 27 ...
 $ Event_Time        : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
 $ MinTime           : POSIXct, format: "2016-10-03 00:00:19" "2016-10-03 00:00:19" ...
 $ MaxTime           : POSIXct, format: "2016-10-03 23:57:27" "2016-10-03 23:57:27" ...
 $ SampTime          : POSIXct, format: "2016-10-03 08:00:38" "2016-10-03 02:41:02" ...
 $ NB                : POSIXct, format: "2016-10-03 08:36:07" "2016-10-03 05:05:41" ...
 $ WaitTime_Min      : num  35.47 2.41 22.98 4.94 7.99 ...
 $ WaitTime_Sec      : num  2128 145 1379 296 480 ...
 $ WaitTime_Sec2     :Class 'difftime'  atomic [1:2780848] 2128 8679 1379 296 480 ...
  .. ..- attr(*, "units")= chr "secs"
 $ WaitTime_Min2     :Class 'difftime'  atomic [1:2780848] 35.47 144.65 22.98 4.94 7.99 ...
  .. ..- attr(*, "units")= chr "secs"

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

Compare WaitData pulled by day and pulled by route.

WaitData_Diff <- anti_join(WaitData_RoutePull,
                           WaitData_DayPull,
                           by = c("RowNum_OG" = "RowNum_OG"
                                 )
                          ) %>% 
  select(-WaitTime_Min,
         -WaitTime_Sec
        )
str(WaitData_Diff)
'data.frame':   130807 obs. of  20 variables:
 $ RowNum_OG         : int  2902760 2952760 2637547 1771590 2911289 1129658 1780069 1729217 1273777 2950017 ...
 $ Route             : chr  "Z8" "Z8" "Z8" "Z8" ...
 $ RouteGroup        : num  1 1 1 1 1 1 1 1 1 1 ...
 $ Event_Time_Date   : int  7 7 6 5 7 6 6 6 3 7 ...
 $ StopID_Clean      : chr  "2005465" "2005465" "2005465" "2005465" ...
 $ Event_Type        : int  3 3 4 3 3 4 3 4 4 3 ...
 $ Event_Description : Factor w/ 3 levels "Serviced Stop                                     ",..: 1 1 3 1 1 3 1 3 3 1 ...
 $ Event_Time_Yr     : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
 $ Event_Time_Mth    : int  10 10 10 10 10 10 10 10 10 10 ...
 $ Event_Time_Day    : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 6 6 5 4 6 5 5 5 2 6 ...
 $ Event_Time_Hr     : int  19 10 15 17 17 21 17 8 8 18 ...
 $ Event_Time_HrGroup: Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 7 4 6 6 6 8 6 3 3 7 ...
 $ Event_Time_Min    : int  51 18 49 55 40 8 7 17 10 7 ...
 $ Event_Time        : POSIXct, format: "2016-10-07 19:51:47" "2016-10-07 10:18:57" ...
 $ MinTime           : POSIXct, format: "2016-10-07 00:00:07" "2016-10-07 00:00:07" ...
 $ MaxTime           : POSIXct, format: "2016-10-07 23:59:55" "2016-10-07 23:59:55" ...
 $ SampTime          : POSIXct, format: "2016-10-07 04:55:14" "2016-10-07 21:41:22" ...
 $ NB                : POSIXct, format: "2016-10-07 05:12:42" "2016-10-07 21:45:51" ...
 $ WaitTime_Sec2     :Class 'difftime'  atomic [1:130807] 1048 269 10822 465 1354 ...
  .. ..- attr(*, "units")= chr "secs"
 $ WaitTime_Min2     :Class 'difftime'  atomic [1:130807] 17.46 4.48 180.36 7.75 22.56 ...
  .. ..- attr(*, "units")= chr "secs"
View(head(WaitData_Diff, 500))

Waiting time analyses.

Munging and sampling data to go from time beteen buses to “average” waiting time.

Compare WaitData (pulled by route) and original data (NewTravTime).

filter(Samp,
              Route == "X2" &
              StopID_Clean == 1003774
            # RowNum_OG = 1146723
            # Event_Time = 2016-10-07 15:32:18
           )
Error in filter_(.data, .dots = lazyeval::lazy_dots(...)) : 
  object 'Samp' not found

Clean up the data a bit.

rm(Compare_NTT_WD)
str(WaitTime_AsNum)
'data.frame':   2810109 obs. of  64 variables:
 $ RowNum_OG                      : int  771269 510393 842137 416282 403679 478483 842251 403790 842364 403906 ...
 $ UniqueLatLng                   : chr  "38.867313__-77.053574" "38.867313__-77.053574" "38.867313__-77.053574" "38.867313__-77.053574" ...
 $ group                          : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ StartStop_ID                   : chr  "6000273--2" "6000273--2" "6000273--2" "6000273--2" ...
 $ BusDay_EventNum                : int  2 70 55 55 55 94 164 158 272 266 ...
 $ Bus_ID                         : int  2915 2719 2950 2634 2625 2674 2950 2625 2950 2625 ...
 $ Route                          : chr  "10A" "10A" "10A" "10A" ...
 $ RteChange2                     : Factor w/ 2 levels "Change","Same": 2 2 2 2 2 2 2 2 2 2 ...
 $ RouteAlt                       : Factor w/ 14 levels "1","10","11",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ DirChange2                     : Factor w/ 2 levels "Change","Same": 2 2 2 2 2 2 2 2 2 2 ...
 $ Route_Direction                : Factor w/ 12 levels "","ANTICLKW",..: 7 7 7 7 7 7 7 7 7 7 ...
 $ Stop_Sequence                  : int  55 55 55 55 55 55 55 55 55 55 ...
 $ Start_ID                       : chr  "6000273" "6000273" "6000273" "6000273" ...
 $ Start_Desc                     : chr  "ARMY-NAVY DR + S HAYES ST" "ARMY-NAVY DR + S HAYES ST" "ARMY-NAVY DR + S HAYES ST" "ARMY-NAVY DR + S HAYES ST" ...
 $ StopID_Clean                   : chr  "2" "2" "2" "2" ...
 $ StopID_Indicator               : Factor w/ 2 levels "ID_Bad","ID_OK": 1 1 1 1 1 1 1 1 1 1 ...
 $ Stop_Desc                      : chr  "PENTAGON INBOUND STOP" "PENTAGON INBOUND STOP" "PENTAGON INBOUND STOP" "PENTAGON INBOUND STOP" ...
 $ countryCode                    : Factor w/ 1 level "US": 1 1 1 1 1 1 1 1 1 1 ...
 $ Stop_State                     : Factor w/ 3 levels "DC","MD","VA": 3 3 3 3 3 3 3 3 3 3 ...
 $ Stop_County                    : Factor w/ 11 levels "Anne Arundel",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ Stop_City                      : Factor w/ 56 levels "Accokeek","Alexandria",..: 4 4 4 4 4 4 4 4 4 4 ...
 $ Stop_Zip                       : Factor w/ 153 levels "20001","20002",..: 132 132 132 132 132 132 132 132 132 132 ...
 $ Event_Type                     : int  3 4 3 3 4 3 3 4 3 4 ...
 $ Event_Description              : Factor w/ 3 levels "Serviced Stop                                     ",..: 1 3 1 1 3 1 1 3 1 3 ...
 $ Event_Time_Yr                  : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
 $ Event_Time_Mth                 : int  10 10 10 10 10 10 10 10 10 10 ...
 $ Event_Time_Date                : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Event_Time_Day                 : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
 $ Event_Time_Hr                  : int  0 1 5 5 6 6 7 8 9 10 ...
 $ Event_Time_HrGroup             : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 1 1 2 2 3 3 3 3 4 4 ...
 $ Event_Time_Min                 : int  1 3 5 38 21 40 12 36 21 27 ...
 $ Event_Time                     : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
 $ Departure_Time                 : POSIXct, format: "2016-10-03 00:01:53" "2016-10-03 01:03:51" ...
 $ Dwell_Time                     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ Dwell_Time2                    : num  0 0 0 0 0 0 0 0 0 0 ...
 $ Delta_Time                     : int  -210 -89 -35 149 914 253 217 1267 400 900 ...
 $ Latitude                       : num  38.9 38.9 38.9 38.9 38.9 ...
 $ Longitude                      : num  -77.1 -77.1 -77.1 -77.1 -77.1 ...
 $ Heading                        : int  23 23 23 23 23 23 23 23 23 23 ...
 $ Odometer_Distance              : int  1131407 909311 87585 80914 88439 69784 211146 212739 336615 337781 ...
 $ Odometer_Distance_Lag1         : int  1131407 908412 87585 80914 85325 69784 211146 211995 336615 337065 ...
 $ Odometer_Distance_Mi           : num  214.3 172.2 16.6 15.3 16.7 ...
 $ TravelDistance_Ft              : int  NA 899 NA NA 3114 NA NA 744 NA 716 ...
 $ TravelDistance_Mi              : num  NA 0.17 NA NA 0.59 ...
 $ TravelDistance_Mi_Hvrs         : num  0.322 0.322 0.322 0.322 0.319 ...
 $ TravelTime_Sec                 : num  94 191 130 85 183 114 183 183 124 128 ...
 $ TravelTime_Hr                  : num  0.0261 0.0531 0.0361 0.0236 0.0508 ...
 $ SpeedAvg_Mph                   : num  NA 3.21 NA NA 11.6 ...
 $ TravelDistance_Mi_New          : num  0.322 0.17 0.322 0.322 0.59 ...
 $ TravelDistance_Mi_New_Label    : Factor w/ 5 levels "TD_Mi_SS_Mean",..: 5 4 5 5 4 5 5 4 5 4 ...
 $ TravelDistance_Mi_NewHvrs      : num  0.322 0.17 0.322 0.322 0.59 ...
 $ TravelDistance_Mi_NewHvrs_Label: Factor w/ 5 levels "TD_Mi_SS_Mean",..: 5 4 5 5 4 5 5 4 5 4 ...
 $ SpeedAvg_Mph_NewHvrs           : num  12.32 3.21 8.91 13.62 11.6 ...
 $ TT_Sec_New                     : num  94 191 130 85 183 114 183 183 124 128 ...
 $ TT_Sec_New_Label               : Factor w/ 4 levels "TravelTime_Sec",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ TT_Hr_New                      : num  0.0261 0.0531 0.0361 0.0236 0.0508 ...
 $ RouteGroup                     : num  4 4 4 4 4 4 4 4 4 4 ...
 $ MinTime                        : POSIXct, format: "2016-10-03 00:00:19" "2016-10-03 00:00:19" ...
 $ MaxTime                        : POSIXct, format: "2016-10-03 23:57:27" "2016-10-03 23:57:27" ...
 $ SampTime                       : POSIXct, format: "2016-10-03 08:00:38" "2016-10-03 02:41:02" ...
 $ NB                             : POSIXct, format: "2016-10-03 08:36:07" "2016-10-03 05:05:41" ...
 $ WaitTime_Sec2                  : num  2128 8679 1379 296 480 ...
 $ WaitTime_Min2                  : num  35.47 144.65 22.98 4.94 7.99 ...
 $ RouteStop_ID                   : Factor w/ 20897 levels "10A__2","10A__3",..: 1 1 1 1 1 1 1 1 1 1 ...

General exploration of wait times.

filter(WaitTime_AsNum,
            Route == "D8" &  # route has VERY limited service after midnight
              StopID_Clean == 1001669
            # Event_Time = 2016-10-06 20:31:16
           )
Error in filter_(.data, .dots = lazyeval::lazy_dots(...)) : 
  object 'WaitTime_AsNum' not found

Looks like there might be an issue in wait times when very few Route-Stop combinations are included in the dataset. Let’s explore these.

Histogram of the counts of Route-StopID combinations.

RouteStop_Cnts_Bar <- ggplot(RouteStop_CntOfCnt,
                             aes(x = RouteStop_CntNum,
                                 # y = ..density..
                                 y = RouteStopCnt_CntNum
                                )
                            ) +
  # geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_col(fill = "lightblue", colour = "grey60", size = 0.2) +
  coord_cartesian(xlim = c(0, 500)
                  # ylim = c(0, 0.02)
                 ) +
  labs(title = "Variation in Routes Passing a Specific Stop",
       x = "Occurrences of Route-StopID Combiantions",
       y = "Counts"
      )
RouteStop_Cnts_Bar

Create a new dataset limiting extremely small counts of Route-StopID combinations.

filter(WaitTime_RteCnts,
       RouteStop_CntNum > 60  # 12 passes per day in a 5-day dataset
      ) %>% 
  select(WaitTime_Min2) %>% 
  summary()
 WaitTime_Min2     
 Min.   :   0.000  
 1st Qu.:   7.477  
 Median :  16.512  
 Mean   :  49.070  
 3rd Qu.:  33.572  
 Max.   :2478.582  
 NA's   :16298     
filter(WaitTime_RteCnts,
       WaitTime_Min2 < 180  # probably means that something went wrong
      ) %>% 
  select(WaitTime_Min2) %>% 
  summary()
 WaitTime_Min2    
 Min.   :  0.000  
 1st Qu.:  6.969  
 Median : 15.156  
 Mean   : 25.022  
 3rd Qu.: 28.187  
 Max.   :180.000  

Histogram of all wait times.

WaitTime_AllBus_HistDen <- ggplot(filter(select(WaitTime_RteCnts,
                                                WaitTime_Min2
                                               ),
                                         !is.na(WaitTime_Min2)
                                        ),
                                  aes(x = WaitTime_Min2,
                                      y = ..density..
                                     )
                                ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  scale_x_continuous(breaks = seq(0, 300, 30)
                    ) +
  coord_cartesian(xlim = c(0, 300),
                  ylim = c(0, 0.035)
                 ) +
  labs(title = "Variation in Wait Time",
       x = "Wait Time (min)",
       y = "Density"
      )
WaitTime_AllBus_HistDen

Box plots for WaitTime (all busses, by Zip Code).

# Count_Values is needed to display the medians on the box plots
BusRoute <- select(WaitTime_RteCnts,
                   Route,
                   WaitTime_Min2,
                   Stop_Zip
                  ) %>% 
  filter(Route == "X2")
CountValues_AllBus_Zip <- ddply(BusRoute,
                                .(Stop_Zip),
                                summarise,
                                Value_Counts = median(WaitTime_Min2, na.rm = TRUE)
                               )
WaitTime_AllBus_Zip_Box <- ggplot(BusRoute,
                                  aes(factor(Stop_Zip),
                                      WaitTime_Min2,
                                      fill = factor(Stop_Zip)
                                     )
                                 ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for the X2)",
       x = "Zip Code of Destination",
       y = "Waiting Time (min)"
      )
WaitTime_AllBus_Zip_Box

Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Zip Code).

WaitTime_AllBus_Zip_Violin <- ggplot(BusRoute,
                                     aes(factor(Stop_Zip),
                                         WaitTime_Min2,
                                         fill = factor(Stop_Zip)
                                        )
                                    ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for the X2)",
       x = "Zip Code of Destination",
       y = "Waiting Time (min)"
      )
TimeBtwEvents_X2_ViolinPlot_z

Box plots for WaitTime (Zip Code, by HourGroupZip).

# Count_Values is needed to display the medians on the box plots
Zip <- select(WaitTime_RteCnts,
              Route,
              WaitTime_Min2,
              Stop_Zip,
              Event_Time_HrGroup
             ) %>% 
  filter(Stop_Zip == 20002)
CountValues_AllBus_HG <- ddply(Zip,
                               .(Event_Time_HrGroup),
                               summarise,
                               Value_Counts = median(WaitTime_Min2,
                                                     na.rm = TRUE
                                                    )
                               )
WaitTime_AllBus_HG_Box <- ggplot(Zip,
                                 aes(factor(Event_Time_HrGroup),
                                     WaitTime_Min2,
                                     fill = factor(Event_Time_HrGroup)
                                    )
                                ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_HG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for Zip 20002)",
       x = "Hour Group",
       y = "Waiting Time (min)"
      )
  # facet_wrap(~Stop_Zip
  #            # nrow = 5
  #           )
WaitTime_AllBus_HG_Box

Violin plots for WaitTime (Zip Code, by HourGroupZip).

WaitTime_AllBus_HG_Vln <- ggplot(Zip,
                                 aes(factor(Event_Time_HrGroup),
                                     WaitTime_Min2,
                                     fill = factor(Event_Time_HrGroup)
                                    )
                                ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_HG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 90)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for Zip 20002)",
       x = "Hour Group",
       y = "Waiting Time (min)"
      )
  # facet_wrap(~Stop_Zip
  #            # nrow = 5
  #           )
WaitTime_AllBus_HG_Vln

Box plots for WaitTime (Route, by HourGroupZip).

Violin plots for WaitTime (Zip Code, by HourGroupZip).

X2 Percentiles Line Graph Test.

GET DATA READY FOR SHINY

Shiny_WaitData_Base <- select(WaitTime_RteCnts,
                              Route,
                              Stop_Zip,
                              Event_Time_Date,
                              Event_Time_Day,
                              Event_Time_HrGroup,
                              Event_Time_Hr,
                              Latitude,
                              Longitude,
                              WaitTime_Min2
                             )
Shiny_WaitData_Base$Route <- factor(Shiny_WaitData_Base$Route)
str(Shiny_WaitData_Base)
'data.frame':   2810109 obs. of  9 variables:
 $ Route             : Factor w/ 268 levels "10A","10B","10E",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Stop_Zip          : Factor w/ 153 levels "20001","20002",..: 132 132 132 132 132 132 132 132 132 132 ...
 $ Event_Time_Date   : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Event_Time_Day    : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
 $ Event_Time_HrGroup: Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 1 1 2 2 3 3 3 3 4 4 ...
 $ Event_Time_Hr     : int  0 1 5 5 6 6 7 8 9 10 ...
 $ Latitude          : num  38.9 38.9 38.9 38.9 38.9 ...
 $ Longitude         : num  -77.1 -77.1 -77.1 -77.1 -77.1 ...
 $ WaitTime_Min2     : num  35.47 144.65 22.98 4.94 7.99 ...
write.table(Shiny_WaitData_Base,
            "Shiny_WaitData_Base.txt",
            quote = FALSE,
            sep = "\t",
            row.names = FALSE
           )

Testing mapping functionaltiy

map <- get_map(location = c(lon = -77.03676, lat = 38.89784),
               source = "google",
               # maptype = "roadmap"
               zoom = 12
              )
Source : https://maps.googleapis.com/maps/api/staticmap?center=38.89784,-77.03676&zoom=12&size=640x640&scale=2&maptype=terrain&language=en-EN
ggmap(map) +
  geom_polygon(aes(x = long, y = lat, group = group), 
               data = ggtract,
               colour = 'white', 
               fill = 'black', 
               alpha = .4, 
               size = .3
              )

NA

Investigating TravelTime_Sec.


View(filter(TTLargeRteChng,
            !is.na(TravelTime_Sec) &
              RteChange2 == "Same"
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )


# examples where TravelTime_Sec is small (1 sec) and SpeedAvg_Mph_NewHvrs is large.
View(select(NewTravTime,
            # -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
            -(TD_Mi_q2:TD_Mi_SSHG_Cnt_F),
            -(TT_Hr_q2:TT_Hr_SSHG_Cnt_F)
           ) %>% 
       filter((RowNum_OG >= 2217353 & RowNum_OG <= 2217373) | # 2217363
                (RowNum_OG >= 3090321 & RowNum_OG <= 3090341) | # 3090331
                (RowNum_OG >= 80764 & RowNum_OG <= 80784) | # 80774
                (RowNum_OG >= 33840 & RowNum_OG <= 33860) # 33850
           )
    )






# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(filter(TTLargeRteChng,
            (RowNum_OG >= 2250290 & RowNum_OG <= 2250310) | # 2250300
              (RowNum_OG >= 867717 & RowNum_OG <= 867737) | # 867727
              (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
              (RowNum_OG >= 808395 & RowNum_OG <= 808415) # 808405
           )
    )

         
         
# examples where TravelTime_Sec is unusually small (with TravelDistance_Mi values that are large).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1042228 & RowNum_OG <= 1042248) | # 1042238
                (RowNum_OG >= 53816 & RowNum_OG <= 53836) | # 53826
                (RowNum_OG >= 360571 & RowNum_OG <= 360591) | # 360581
                (RowNum_OG >= 502271 & RowNum_OG <= 502291) # 502281 (can't explian the weird TravelTime_Sec calculation here - it's not even an integer!)
           )
    )

# still trying to explain 502281...on the day of this weirdness, the bus was only in circulation for 4-5 stops (~20 minutes) on that day (Oct 6)
View(filter(AllDays_NewTravelDist,
            Bus_ID == 2711
           )
    )


# exploring large values for TravelTime_Sec
View(filter(AllDays_NewTravelDist,
            TravelTime_Sec == 300
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph2
              )
    )

# examples where TravelTime_Sec is unusually large (with TravelDistance_Mi values that are small, so SpeedAvg_Mph values are very small).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 2627459 & RowNum_OG <= 2627479) | # 2627469
                (RowNum_OG >= 2193344 & RowNum_OG <= 2193364) | # 2193354
                (RowNum_OG >= 1644123 & RowNum_OG <= 1644143) | # 1644133
                (RowNum_OG >= 869600 & RowNum_OG <= 869620) # 869610
           )
    )

Investigation of SpeedAvg_Mph2

View(Speed_Pctiles): 90% of SpeedAvg_Mph2 are between ~3mph and ~66mph.


Speed_Ntile <- as.data.frame(AllDays_NewTravelDist$SpeedAvg_Mph2) %>% 
  mutate(Pctile = ntile(AllDays_NewTravelDist$SpeedAvg_Mph2, 100),
         MinR = min_rank(AllDays_NewTravelDist$SpeedAvg_Mph2),
         PctR = percent_rank(AllDays_NewTravelDist$SpeedAvg_Mph2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(Speed_Ntile)[1] <- "SpeedAvg_Mph2"
str(Speed_Ntile)

Speed_Ntile_Rows <- nrow(Speed_Ntile)

View(tail(Speed_Ntile, 500))


Speed_Pctiles <- group_by(Speed_Ntile,
                          PctR_Round
                         ) %>% 
  summarise(
    MinSpeedAtPctile = min(SpeedAvg_Mph2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / Speed_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(Speed_Pctiles)

Investigation of SpeedAvg_Mph2.

Exploring the removal of outlier TravelTime_Sec and TravelDistance_Mi.


summary(select(AllDays_NewTravelDist,
               SpeedAvg_Mph,
               SpeedAvg_Mph2
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      TravelDistance_Mi > 0.0001893939 & # lowest non-zero percentile
                        TravelDistance_Mi < 1.0812500000 & # 99th percentile
                        TravelTime_Sec > 10.050000 & # 2nd percentile
                        TravelTime_Sec < 293.000000 # 98th percentile
                     ),
               SpeedAvg_Mph,
               SpeedAvg_Mph2
              )
       )

Investigation of SpeedAvg_Mph2.

Histogram of SpeedAvg_Mph2.


Speed_HistDen <- ggplot(filter(AllDays_NewTravelDist,
                               !is.na(SpeedAvg_Mph2)
                              ),
                        aes(x = SpeedAvg_Mph2,
                            y = ..density..
                           )
                       ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  stat_bin(binwidth = 5,
           geom = "text",
           size = 2.5,
           vjust = 1.5,
           aes(label = format(..count.., big.mark = ",")
              ),
          ) +
  # geom_text(aes(label = format(..count.., big.mark = ",")
  #              ),
  #           size = 3,
  #           nudge_y = (..count.. * 0.1)
  #          ) +
  coord_cartesian(xlim = c(0, 70), ylim = c(0, 0.04)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Speed",
       x = "Average Speed (mph)",
       y = "Density"
      )

Speed_HistDen

Investigation of SpeedAvg_Mph2.

Histogram of SpeedAvg_Mph2 after removing outlier TravelTime_Sec and TravelDistance_Mi.


View(TravDistMiNew_Pctiles)
View(TravTimeHr_Pctiles)

SpeedNoOutlier_HistDen <- ggplot(filter(AllDays_NewTravelDist,
                                        !is.na(SpeedAvg_Mph2) &
                                          TravelDistance_Mi_New > 0.077841005 & # 5th percentile
                                          # TravelDistance_Mi_New < 1.0812500000 & # 99th percentile
                                          TravelTime_Sec > 12.100000 # 4th percentile
                                          # TravelTime_Sec < 293.000000 # 98th percentile
                                       ),
                                 aes(x = SpeedAvg_Mph2,
                                     y = ..density..
                                    )
                                ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  stat_bin(binwidth = 5,
           geom = "text",
           size = 2.5,
           vjust = 1.5,
           aes(label = format(..count.., big.mark = ",")
              ),
          ) +
  # geom_text(aes(label = format(..count.., big.mark = ",")
  #              ),
  #           size = 3,
  #           nudge_y = (..count.. * 0.1)
  #          ) +
  coord_cartesian(xlim = c(0, 70), ylim = c(0, 0.04)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Speed",
       subtitle = "(removed low outliers of Travel Distance and Travel Time)",
       x = "Average Speed (mph)",
       y = "Density"
      )

SpeedNoOutlier_HistDen

Investigation of SpeedAvg_Mph2.

New dataset (NoOutliers_TravelDistNTime) when removing outlier low values of TravelDistance_Mi_New and TravelTime_Sec.


View(TravDistMiNew_Pctiles)
View(TravTimeHr_Pctiles)

NoOutliers_TravelDistNTime <- filter(AllDays_NewTravelDist,
                                     TravelDistance_Mi_New > .077841005 & # 5th percentile
                                       # TravelDistance_Mi_New < 1.0812500000 & # 99th percentile
                                       TravelTime_Sec > 12.100000 # 4th percentile
                                       # TravelTime_Sec < 293.000000 # 98th percentile
                                    )

nrow(AllDays_NewTravelDist) - nrow(NoOutliers_TravelDistNTime)

str(NoOutliers_TravelDistNTime)
summary(NoOutliers_TravelDistNTime)

Investigation of SppedAvg_Mph2.

View(Speed_NoOut_Pctiles): Aproximately 90% of SpeedAvg_Mph2 values are between ~4mph and ~56mph.


Speed_NoOut_Ntile <- as.data.frame(NoOutliers_TravelDistNTime$SpeedAvg_Mph2) %>% 
  mutate(Pctile = ntile(NoOutliers_TravelDistNTime$SpeedAvg_Mph2, 100),
         MinR = min_rank(NoOutliers_TravelDistNTime$SpeedAvg_Mph2),
         PctR = percent_rank(NoOutliers_TravelDistNTime$SpeedAvg_Mph2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(Speed_NoOut_Ntile)[1] <- "SpeedAvg_Mph2"
str(Speed_NoOut_Ntile)

Speed_NoOut_Ntile_Rows <- nrow(Speed_NoOut_Ntile)

View(tail(Speed_NoOut_Ntile, 500))


Speed_NoOut_Pctiles <- group_by(Speed_NoOut_Ntile,
                                PctR_Round
                               ) %>% 
  summarise(
    MinSpeedAtPctile = min(SpeedAvg_Mph2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / Speed_NoOut_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(Speed_NoOut_Pctiles)

Investigation of SppedAvg_Mph2.

Exloring odd/impossible values.


# Exploring when SpeedAvg_Mph2 is NA  --  does not occur at all
nrow(filter(NoOutliers_TravelDistNTime,
            is.na(SpeedAvg_Mph2)
           )
    )


# Exploring when SpeedAvg_Mph2 is zero  --  does not occur at all
nrow(filter(NoOutliers_TravelDistNTime,
            SpeedAvg_Mph2 == 0
           )
    )


# examples where SpeedAvg_Mph2 < 3.2848770
View(filter(AllDays_NewTravelDist,
            SpeedAvg_Mph2 > 0 &
              SpeedAvg_Mph2 < 3.2848770
           ) %>% 
       arrange(SpeedAvg_Mph2)
    )

# examples where SpeedAvg_Mph2 < 3.2848770
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 485338 & RowNum_OG <= 485358) | # 485348  --  Extreme travel time, Route Change
                (RowNum_OG >= 346952 & RowNum_OG <= 346972) | # 346962  -- Extreme travel time, Route Change 
                (RowNum_OG >= 70494 & RowNum_OG <= 70514) | # 70504  --  Extreme travel time, Route Change
                (RowNum_OG >= 2051846 & RowNum_OG <= 2051866) # 2051856  --  Extreme travel time, Route Change
           )
    )

Investigation of SpeedAvg_Mph2.

Limit the dataset based on SpeedAvg_Mph2.


NoOutliersSpeed <- filter(NoOutliers_TravelDistNTime,
                          between(SpeedAvg_Mph2,
                                  4.069300, # 5th percentile
                                  56.05651 #95th percentile
                                 )
                          )

nrow(NoOutliers_TravelDistNTime) - nrow(NoOutliersSpeed)

summary(NoOutliersSpeed)

TravelTime now looks like it has some odd values on the high end. So let’s look at those.

View(TravTime_NoOut_Pctiles): Virtually all trips should take less than 5 minutes. (The 99th percentile of of TravelTime is approximately 8 minutes.)


TravTime_NoOut_Ntile <- as.data.frame(NoOutliersSpeed$TravelTime_Hr) %>% 
  mutate(Pctile = ntile(NoOutliersSpeed$TravelTime_Hr, 100),
         MinR = min_rank(NoOutliersSpeed$TravelTime_Hr),
         PctR = percent_rank(NoOutliersSpeed$TravelTime_Hr),
         PctR_Round = round(PctR, 2)
        )

colnames(TravTime_NoOut_Ntile)[1] <- "TravelTime_Hr"
str(TravTime_NoOut_Ntile)

TravTime_NoOut_Ntile_Rows <- nrow(TravTime_NoOut_Ntile)

View(tail(TravTime_NoOut_Ntile, 500))


TravTime_NoOut_Pctiles <- group_by(TravTime_NoOut_Ntile,
                                   PctR_Round
                                  ) %>% 
  summarise(
    MinTravTimeHrAtPctile = min(TravelTime_Hr),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravTime_NoOut_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile),
         MinTravTimeSecAtPctile = MinTravTimeHrAtPctile * (60 * 60)
        )

View(TravTime_NoOut_Pctiles)

Investigating odd TravelTime_Sec values.

Trips longer than ~8 minutes.


View(filter(NoOutliersSpeed,
            TravelTime_Sec > 491 # min at the 100th percentile
           ) %>% 
       arrange(desc(TravelTime_Sec)
              )
    )

# examples of TravelTime_Sec values that are largest.
View(filter(NoOutliersSpeed,
            (RowNum_OG >= 2071759 & RowNum_OG <= 2071779) | # 2071769  --  results from a route change, and a 3hr+ wait before the new route starts
                (RowNum_OG >= 1473686 & RowNum_OG <= 1473706) | # 1473696  --  results from a route change, and a 3hr wait before the new route starts
                (RowNum_OG >= 1222822 & RowNum_OG <= 1222842) | # 1222832  --  results from a route change, and a 3hr wait before the new route starts
                (RowNum_OG >= 3046089 & RowNum_OG <= 3046109) # 3046099  --  results from a route change, and a 3hr wait before the new route starts
           )
    )


# examples of TravelTime_Sec values that are the smallest of the large.
View(filter(NoOutliersSpeed,
            (RowNum_OG >= 3044689 & RowNum_OG <= 3044709) | # 3044699  --  results from a route change
                (RowNum_OG >= 3022358 & RowNum_OG <= 3022378) | # 3022368  --  results from a route change
                (RowNum_OG >= 2993016 & RowNum_OG <= 2993036) | # 2993026  --  results from a previous route change (change occurred in deleted row)
                (RowNum_OG >= 2683703 & RowNum_OG <= 2683723) # 2683713  --  results from a previous route change (change occurred in deleted row)
           )
    )

Let’s look at the TravelTime_Sec values and route changes (DirChange2).

The 99th percentile of TravelTime_Sec for both, all trips, and just those trips NOT involving route changes (DirChange2 = “Same”), is approximately 5min (300 sec).

Nota Bene: The percentile calculation here is defined slightly different than in most of the above analyses (which get the lowest value in the bin created by 100 ntiles).


summary(select(NoOutliersSpeed,
               TravelTime_Sec
              )
       )

summary(select(filter(NoOutliersSpeed,
                      DirChange2 == "Same"
                     ),
               TravelTime_Sec
              )
       )

summary(select(filter(NoOutliersSpeed,
                      DirChange2 == "Change"
                     ),
               TravelTime_Sec
              )
       )


TravTimeSec_Qtiles_df <- data.frame(PctValue = seq(0, 100, 1),
                                    All = seq(1, 101, 1),
                                    Same = seq(1, 101, 1),
                                    Change = seq(1, 101, 1)
                                   )

TravTimeSec_Qtiles_df[ , 2] <- quantile(select(NoOutliersSpeed,
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

TravTimeSec_Qtiles_df[ , 3] <- quantile(select(filter(NoOutliersSpeed,
                                                      DirChange2 == "Same"
                                                     ),
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

TravTimeSec_Qtiles_df[ , 4] <- quantile(select(filter(NoOutliersSpeed,
                                                      DirChange2 == "Change"
                                                     ),
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

View(TravTimeSec_Qtiles_df)

Limit the dataset now based on TravelTime_Sec.


UpperLimitTravTime <- filter(NoOutliersSpeed,
                             TravelTime_Sec <= 491 # min at the 100th percentile
                             )

nrow(NoOutliersSpeed) - nrow(UpperLimitTravTime)

str(UpperLimitTravTime)

summary(UpperLimitTravTime)

Investigation of Dwell_Time2 (how long the bus is at a stop).

Differences between Dwell_Time (by WMATA) and Dwell_Time2 (by me) appear to be due to switches in RouteAlt. WMATA calculates Dwell_Time by an unknown process. The WMATA calculation is equal to my calculation, except for the records immedaitely before and after a RouteAlt switch (DirChange2).


View(filter(AllDays_NewOrder,
            Dwell_Time != Dwell_Time2
           )
    )


# Examples where the Dwell_Time and Dwell_Time2 are different
View(filter(AllDays_NewOrder,
            ( (RowNum_OG >= 65 & RowNum_OG <= 85) | # 75
                (RowNum_OG >= 162 & RowNum_OG <= 192) | # 172
                (RowNum_OG >= 431952 & RowNum_OG <= 431972) | # 431962
                (RowNum_OG >= 434595 & RowNum_OG <= 434615) # 434605  --  this record is NOT a route switch, but does has a Sequence switch (Me: should there really be a route switch here?)
            )
           )
    )

Investigation of Dwell_Time2 (how long the bus is at a stop).

First, create some “rank” stats. View(DT2_Pctiles): 95% of Dwell_Time2s are <= 23 seconds…but some weird (e.g., nearly 2 hour Dwell_Time2s exist).


DwellTime2_Ntile <- as.data.frame(AllDays_NewOrder$Dwell_Time2) %>% 
  mutate(Pctile = ntile(AllDays_NewOrder$Dwell_Time2, 100),
         MinR = min_rank(AllDays_NewOrder$Dwell_Time2),
         PctR = percent_rank(AllDays_NewOrder$Dwell_Time2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DwellTime2_Ntile)[1] <- "Dwell_Time2"
str(DwellTime2_Ntile)

DwellTime2_Ntile_Rows <- nrow(DwellTime2_Ntile)

View(tail(DwellTime2_Ntile, 500))


DwellTime2_Pctiles <- group_by(DwellTime2_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinDwellAtPctile = min(Dwell_Time2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DwellTime2_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DwellTime2_Pctiles)

Investigation of Dwell_Time2 (how long the bus is at a stop).

Histogram of Dwell_Time2.


DwellTime2_HistDen <- ggplot(AllDays_NewOrder, aes(x = Dwell_Time2, y = ..density..)) +
  geom_histogram(binwidth = 1, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(1, 25), ylim = c(0, 0.05)
                 ) +
  xlab("Time a Bus Stays at a Stop (sec)") + 
  ylab("Density") + 
  #  theme(legend.position="none") + 
  ggtitle(expression(atop("Variation in How Long a Bus Stays at a Stop"
                          # ,atop(italic("xxxxx"),"")
                         )
                    )
         )

DwellTime2_HistDen

Investigation of Dwell_Time2 (how long the bus is at a stop).

Looking at some weirdly long Dwell_Time2 values.


View(arrange(AllDays_NewOrder,
             desc(Dwell_Time2)
            )
    )


# examples of extremely large Dwell_Time2s
View(filter(AllDays_NewOrder,
            (RowNum_OG >= 292669 & RowNum_OG <= 292689) | # 292679
                (RowNum_OG >= 531057 & RowNum_OG <= 531077) | # 531067
                (RowNum_OG >= 1388627 & RowNum_OG <= 1388647) | # 1388637
                (RowNum_OG >= 1645711 & RowNum_OG <= 1645731) # 1645721
           )
    )


View(filter(AllDays_NewOrder,
            Dwell_Time2 == 0
           )
    )

Investigation of Delta_Time (how early or late the bus is).

View(DT2_Pctiles): 94% of Delta_Time values are between -236 seconds and 1,259 seconds. Roughly 66% of records are within 5 min late and 5 min early…but some weird (e.g., almost 50 minute late or 40 minute early) Delta_Times exist.

Note that Delta_Time is the difference from the scheduled bus arrival. So if two buses are scheduled to arrive at a destination at 10:00pm and 10:20pm, and if the 10:20pm bus has a Delta_Time of 5 minutes, there are 25 minutes between bus arrivals at the stop.

Also note that based on a comment at https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/, the Delta_Time values don’t appear to coincide with published bus schedules (e.g., the X2 departing every 8 minutes during peak hours).


DeltTime_Ntile <- as.data.frame(AllDays_NewOrder$Delta_Time) %>% 
  mutate(Pctile = ntile(AllDays_NewOrder$Delta_Time, 100),
         MinR = min_rank(AllDays_NewOrder$Delta_Time),
         PctR = percent_rank(AllDays_NewOrder$Delta_Time),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DeltTime_Ntile)[1] <- "Delta_Time"
str(DeltTime_Ntile)

DeltTime_Ntile_Rows <- nrow(DeltTime_Ntile)

View(tail(DeltTime_Ntile, 500))


DeltTime_Pctiles <- group_by(DeltTime_Ntile,
                             PctR_Round
                            ) %>% 
  summarise(
    MinDeltTimeAtPctile = min(Delta_Time),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DeltTime_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DeltTime_Pctiles)
DeltTime_Pctiles

# ~66% of rows are between 5 min late and 5 min early
nrow(filter(AllDays_NewOrder,
            Delta_Time >= -300 &
              Delta_Time <= 300
           )
    ) / nrow(AllDays_NewOrder)


# examples of weird large Delta_Times
View(filter(AllDays_NewOrder,
            Delta_Time < -4202 |
              Delta_Time > 1705
           ) %>% 
       arrange(desc(Delta_Time)
              )
    )

Investigation of Delta_Time (how early or late the bus is).

Delta_Time histogram.


DeltTime_HistDen <- ggplot(AllDays_NewOrder, aes(x = (Delta_Time / 60),
                                                 y = ..density..
                                                )
                          ) +
  geom_histogram(binwidth = (5/60), fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(-5, 5)) +
  xlab("Bus Lateness (min)") + 
  ylab("Density") + 
  #  theme(legend.position="none") + 
  ggtitle(expression(atop("Variation in How Early/Late a Bus Is",
                          atop(italic("(positive values are late arrivals)"),
                               ""
                              )
                         )
                    )
         )

DeltTime_HistDen

Investigation of Delta_Time (how early or late the bus is).

Delta_Time boxplot.


# Count_Values is needed to display the medians on the box plots
Count_Values <- ddply(AllDays_NewOrder,
                      .(Event_Time_HrGroup),
                      summarise,
                      Value_Counts = median(Delta_Time / 60, na.rm = TRUE)
                     )

DeltTime_BoxPlot <- ggplot(AllDays_NewOrder,
                           aes(factor(Event_Time_HrGroup),
                               Delta_Time / 60,
                               fill = factor(Event_Time_HrGroup)
                              )
                          ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE) + 
  # coord_cartesian(ylim = c(-300, 1200)) +
  coord_cartesian(ylim = c(-5, 20)) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  xlab("Hour Group") + 
  ylab("Bus Lateness (minutes)") + 
  theme(legend.position="none", axis.text.x = element_text(angle=45)) + 
  #theme(legend.position="right", axis.text.x = element_blank()) + 
  ggtitle(expression(atop("How Early/Late is the Bus (by Hour Group)",
                          atop(italic("(positive values are late arrivals)"),
                               ""
                              )
                         )
                    )
         )

DeltTime_BoxPlot

Investigation of Delta_Time (how early or late the bus is).

Exploring “extreme” Delta_Times. First let’s get some “rank” stats.


View(DeltTime_Pctiles)
DeltTime_Pctiles


DeltTimeAbs_Ntile <- as.data.frame(abs(AllDays_NewOrder$Delta_Time)) %>% 
  mutate(Pctile = ntile(abs(AllDays_NewOrder$Delta_Time), 100),
         MinR = min_rank(abs(AllDays_NewOrder$Delta_Time)),
         PctR = percent_rank(abs(AllDays_NewOrder$Delta_Time)),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DeltTimeAbs_Ntile)[1] <- "Delta_Time_Abs"
str(DeltTimeAbs_Ntile)

DeltTimeAbs_Ntile_Rows <- nrow(DeltTimeAbs_Ntile)

View(tail(DeltTimeAbs_Ntile, 500))


DeltTimeAbs_Pctiles <- group_by(DeltTimeAbs_Ntile,
                                PctR_Round
                               ) %>% 
  summarise(
    MinDeltTimeAtPctile = min(Delta_Time_Abs),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DeltTime_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DeltTimeAbs_Pctiles)
DeltTimeAbs_Pctiles

Investigation of Delta_Time (how early or late the bus is).

Exploring “extreme” Delta_Times. Then let’s calculate the percentage of buses that are 10 minutes (or more) late/early.


HrGroup_DeltaTime_All <- group_by(AllDays_NewOrder,
                                  Event_Time_HrGroup
                                 ) %>% 
  summarise(EventAll_Cnt = n()
           )

str(HrGroup_DeltaTime_All)
View(HrGroup_DeltaTime_All)


HrGroup_DeltaTime_Above10Min <- filter(AllDays_NewOrder,
                                       abs(Delta_Time) >= 600
                                      ) %>% 
  group_by(Event_Time_HrGroup) %>% 
  summarise(EventAbove10_Cnt = n()
           )

str(HrGroup_DeltaTime_Above10Min)
View(HrGroup_DeltaTime_Above10Min)


HrGroup_DeltaTimeCompare <- inner_join(HrGroup_DeltaTime_Above10Min,
                                       HrGroup_DeltaTime_All,
                                       by = c("Event_Time_HrGroup" = "Event_Time_HrGroup")
                                      ) %>% 
  mutate(PctEventsAbove10 = EventAbove10_Cnt / EventAll_Cnt)

View(HrGroup_DeltaTimeCompare)

Investigation of Delta_Time (how early or late the bus is).

Quickly plot these “extreme” Delta_Times.


DeltTime_Above10_Cols <- ggplot(HrGroup_DeltaTimeCompare,
                                aes(factor(Event_Time_HrGroup),
                                    PctEventsAbove10
                                   )
                               ) +
  geom_col(fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_text(aes(label = format(round(PctEventsAbove10, digits = 2),
                               nsmall = 2
                              )
               ),
            size = 3,
            nudge_y = (HrGroup_DeltaTimeCompare$PctEventsAbove10 * -0.1)
           ) +
  # coord_cartesian(xlim = c(-5, 5)) +
  xlab("Hour Group") + 
  ylab("Percent of All Bus Arrivals") +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  ggtitle(expression(atop("When is a Bus 10+ Minutes Late/Early"
                          # ,atop(italic("positive values are late arrivals"),
                          #      ""
                          #     )
                         )
                    )
         )

DeltTime_Above10_Cols

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Correlation.


DwellTDeltaT_Corr <- as.matrix(cor(x = AllDays_NewOrder$Dwell_Time2,
                                   y = AllDays_NewOrder$Delta_Time,
                                   use = "pairwise"
                                  )
                               )

DwellTDeltaT_Corr

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Next, let’s get a sample of data for plotting. Let’s do this for the full dataset (AllDays_NewOrder).


AllDays_NewOrder_10PctSamp <- sample_frac(AllDays_NewOrder, 0.1) %>% 
  select(Delta_Time,
         Dwell_Time2
        ) %>% 
  mutate(DataSet = "AllData")

str(AllDays_NewOrder_10PctSamp)

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Let’s also get a sample of data for plotting, but with a datset that removes outliers.


View(DeltTime_Pctiles)
View(DwellTime2_Pctiles)

AllDays_NewOrder_NoExtremes_10PctSamp <- filter(AllDays_NewOrder,
                                                between(Delta_Time, -402, 1705) & # removes about 2% of Delta_Time values
                                                  between(Dwell_Time2, 1, 63)  # removes about 2% of Dwell_Time2 values
                                               ) %>% 
  sample_frac(0.1) %>% 
  select(Delta_Time,
         Dwell_Time2
        ) %>% 
  mutate(DataSet = "OutliersRemoved")

str(AllDays_NewOrder_NoExtremes_10PctSamp)

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from the dataset that does not remove outliers.


DwellTDeltaT_Scatter <- ggplot(AllDays_NewOrder_10PctSamp,
                               aes(Dwell_Time2, Delta_Time)
                              ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "red") +
  # xlab("Time at Stop (sec)") + 
  # ylab("Lateness (sec)") +
  annotate(label = lm_eqn(df = AllDays_NewOrder_10PctSamp,
                          y = AllDays_NewOrder_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_10PctSamp$Dwell_Time2
                         ),
           x = 2200,
           y = 600,
           geom = "text",
           size = 3,
           colour = "red",
           parse = TRUE
          ) +
  labs(title = "Lateness vs Time at Stop",
       subtitle = "(no outliers removed)",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
  #                         ,atop(italic("(no outliers removed)"),
  #                               ""
  #                              )
  #                        )
  #                   )
  #        )
# +
#   geom_jitter()

DwellTDeltaT_Scatter

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from the dataset that does remove outliers.


DwellTDeltaT_Scatter_NoExtremes <- ggplot(AllDays_NewOrder_NoExtremes_10PctSamp,
                                          aes(Dwell_Time2, Delta_Time)
                                         ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "blue") +
  # xlab("Time at Stop (sec)") + 
  # ylab("Lateness (sec)") +
  annotate(label = lm_eqn(df = AllDays_NewOrder_NoExtremes_10PctSamp,
                          y = AllDays_NewOrder_NoExtremes_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_NoExtremes_10PctSamp$Dwell_Time2
                         ),
           x = 50,
           y = -475,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  labs(title = "Lateness vs Time at Stop",
       subtitle = "(2% of outliers removed)",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
  #                         ,atop(italic("(2% of outliers removed)"),
  #                               ""
  #                              )
  #                        )
  #                   )
  #        )
# +
#   geom_jitter()

DwellTDeltaT_Scatter_NoExtremes

Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from both datasets together.


CombinedData <- rbind(AllDays_NewOrder_10PctSamp,
                      AllDays_NewOrder_NoExtremes_10PctSamp
                     )

CombinedData$DataSet <- factor(CombinedData$DataSet)

str(CombinedData)


DwellTDeltaT_Scatter_Combined <- ggplot(CombinedData,
                                        aes(x = Dwell_Time2,
                                            y = Delta_Time,
                                            colour = DataSet
                                           )
                                       ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  coord_cartesian(xlim = c(0, 500), ylim = c(-1000, 2000)
                 ) +
  geom_smooth(data = filter(CombinedData,
                            DataSet == "AllData"
                           ),
              method = "lm",
              colour = "red"
             ) +
  geom_smooth(data = filter(CombinedData,
                            DataSet == "OutliersRemoved"
                           ),
              method = "lm",
              colour = "blue"
             ) +
  # facet_wrap( ~ DataSet, ncol = 2) +
  annotate(label = lm_eqn(df = AllDays_NewOrder_10PctSamp,
                          y = AllDays_NewOrder_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_10PctSamp$Dwell_Time2
                         ),
           x = 300,
           y = -600,
           geom = "text",
           size = 3,
           colour = "red",
           parse = TRUE
          ) +
  annotate(label = lm_eqn(df = AllDays_NewOrder_NoExtremes_10PctSamp,
                          y = AllDays_NewOrder_NoExtremes_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_NoExtremes_10PctSamp$Dwell_Time2
                         ),
           x = 300,
           y = -800,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  theme(legend.position = "bottom") +
  labs(title = "Lateness vs Time at Stop",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
                          # ,atop(italic("2% of outliers removed"),
                          #       ""
                          #      )
         #                 )
         #            )
         # )
# +
#   geom_jitter()

DwellTDeltaT_Scatter_Combined

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

---
title: "R Notebook for WMATA Metrobus Data"
output:
  html_notebook: default
  html_document: default
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook for analysis using data on the DC Bus System (WMATA Metrobus).  The data were obtained here:

https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/

Control + Alt + Shift + m = rename in scope

Load the packages to be used.
```{r echo = FALSE, message = FALSE}

library("jsonlite")
library("sqldf")
library("tcltk")
library("tidyr")
library("plyr")
library("dplyr")
library("magrittr")
library("stringr")
library("data.table")
library("lubridate")
library("geosphere")
library("ggplot2")
library("ggmap")
library("ggvis")
library("rbokeh")
library("rgdal")
library("maptools")

```


Get the Bus data.

First let's check the working directory.
```{r}

getwd()

```


Then, actually get the data.
```{r echo = FALSE}

setwd("/Users/mdturse/Desktop/Analytics/DCMetroBus/Bus AVL Oct 2016")

for (i in 3:7){
  assign(paste0("Oct0", i, "Raw"),
         read.delim(paste0("2016100", i, "MetrobusAVL.txt"),
                    sep = "\t",
                    header = TRUE,
                    na.strings = NULL
                   )
        )
  
  message("Oct0", i, "Raw")
 
  str(get(paste0("Oct0", i, "Raw")
         )
     )
  }

```


Put the daily data together.
```{r}

AllDays <- bind_rows(list(Oct03Raw, Oct04Raw, Oct05Raw, Oct06Raw, Oct07Raw),
                     .id = c("group")
                    )
# dim(AllDays)
str(AllDays)

```


Deleting old data frames.
```{r}

for (i in 3:7){
  rm(list = ls(pattern = paste0("Oct0", i, "Raw")
              )
    )
  
  message("Deleting Oct0", i, "Raw")
  }

```


Updating variable types.

Then, sorting the data and adding a RowNumber (to be used for identifying rows later in the analyses.)
```{r}

rm(i)


AllDays$group <- factor(AllDays$group)
AllDays$Route_Direction <- factor(AllDays$Route_Direction)
AllDays$Event_Time <- as.POSIXct(AllDays$Event_Time, format = "%m-%d-%y %I:%M:%S %p")
AllDays$Departure_Time <- as.POSIXct(AllDays$Departure_Time, format = "%m-%d-%y %I:%M:%S %p")

str(AllDays)


AllDays_Sorted <- arrange(AllDays,
                          Bus_ID,
                          Event_Time
                         ) %>% 
  mutate(RowNum_OG = row_number() # this is useful in identify the row later on
        )

rm(AllDays)
str(AllDays_Sorted)

# View(head(AllDays_Sorted, 100))

```


Inspecting the values of Stop_ID, and finding that it can take the values "" (blank) and "NULL".
```{r}

View(group_by(AllDays_Sorted,
              Stop_ID
             ) %>% 
       summarise(
         Cnt = n()
         ) %>% 
       arrange(Stop_ID)
    )

View(filter(AllDays_Sorted,
            is.na(Stop_ID) |
              Stop_ID == "" |
              Stop_ID == "NULL"
           ) %>% 
       arrange(Stop_Desc)
    )

```


Creating a table of distinct Stop_Desc values when Stop_ID is "" (blank) or "NULL".
```{r}

StopID_New <- filter(AllDays_Sorted,
                     is.na(Stop_ID) |
                       Stop_ID == "" |
                       Stop_ID == "NULL"
                    ) %>% 
  select(Stop_ID, Stop_Desc) %>% 
  distinct() %>% 
  arrange(Stop_ID, Stop_Desc) %>% 
  mutate(StopID_New = 1:nrow(.)
        )

View(StopID_New)

```


Creating a full updated table by filling in StopID_New for when Stop_ID is "" (blank) or NULL.
```{r}

AllDays_StopIDNew <- left_join(AllDays_Sorted,
                               select(StopID_New,
                                      Stop_Desc,
                                      StopID_New
                                     ),
                               by = c("Stop_Desc" = "Stop_Desc")
                              ) %>% 
  mutate(StopID_Clean = ifelse(is.na(StopID_New),
                               Stop_ID,
                               StopID_New
                              ),
         StopID_Indicator = factor(ifelse(is.na(StopID_New),
                                          "ID_OK",
                                          "ID_Bad"
                                         )
                                  )
        )

rm(StopID_New)
rm(AllDays_Sorted)
str(AllDays_StopIDNew)

# View(tail(AllDays_StopIDNew, 500))
# View(filter(AllDays_StopIDNew,
#             Stop_Desc == "METROWAY ANNNOUCEMNT CORR"
#            )
#     )

```



Lat Long stats for pulling in Zip codes later.
```{r}

LL_Stats <- group_by(AllDays_StopIDNew,
                     StopID_Clean
                    ) %>% 
  summarise(Lat_Mean = mean(Latitude, na.rm = TRUE),
            Lat_Med = median(Latitude, na.rm = TRUE),
            Lng_Mean = mean(Longitude, na.rm = TRUE),
            Lng_Med = median(Longitude, na.rm = TRUE)
           ) %>% 
  mutate(Lat_MeaLessMed = Lat_Mean - Lat_Med,
         Lng_MeaLessMed = Lng_Mean - Lng_Med,
         RowNum = row_number()
        )

str(LL_Stats)
summary(LL_Stats)

View(head(arrange(LL_Stats,
                  Lat_MeaLessMed
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  desc(Lat_MeaLessMed)
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  Lng_MeaLessMed
                 ),
          500
         )
    )

View(head(arrange(LL_Stats,
                  desc(Lng_MeaLessMed)
                 ),
          500
         )
    )

```


Pulling in Zip Code data from api.geonames.org.
```{r}

# URL EXAMPLE:
# http://api.geonames.org/findNearbyPostalCodesJSON?lat=38.89560&lng=-76.94873&radius=0&username=supermdat

url_1 <- "http://api.geonames.org/findNearbyPostalCodesJSON?lat="
url_2 <- "&lng="
url_3 <- "&radius=0&username="
username <- "supermdat"


# need to group in bunches as http://api.geonames.org limits pulls to 2000 per hour


##### Store everything in multiple lists
pages1 <- list()


system.time(
#   for(j in 0:5){
#   for(k in ((max_row_per_hr*j) + 1):(max_row_per_hr*(j+1)
#                                     )
#      ){
#     
#   }
# }
for(i in 1:1000){
  lat <- filter(LL_Stats,
                RowNum == i
               ) %>%
    select(Lat_Med)
  
  lng <- filter(LL_Stats,
                RowNum == i
               ) %>%
    select(Lng_Med)
  
  APIData1 <- fromJSON(paste0(url_1,
                              lat,
                              url_2,
                              lng,
                              url_3,
                              username
                             ),
                       flatten = TRUE
                      )
  
  message("Retrieving Zip Code ", i)
  
  pages1[[i]] <- APIData1$postalCodes
  
  # Sys.sleep(3900)
}
)

# class(APIData1)
# class(APIData2$postalCodes)
# str(APIData1)
# head(APIData1)
# class(pages1)
# str(pages1)
# nrow(page1)
# ncol(page1)
# pages1[[1199]]
# pages1[[2051]]


##### Combine the lists into one page
Zips1 <- rbind.pages(pages1[sapply(pages1, length) > 0])


##### Combine all pages
Zips_All <- bind_rows(Zips0,
                      Zips1,
                      Zips2,
                      Zips3,
                      Zips4,
                      Zips5,
                      Zips6,
                      Zips7,
                      Zips8,
                      Zips9,
                      Zips10,
                      # Zips1_a,
                      .id = "id"
                     ) %>% 
  mutate(UniqueLatLng = paste(lat, lng, sep = "__")
        )

# str(Zips_All)
# View(head(Zips_All))


# str(LL_Stats)
LL_Stats_UnqLatLng <- mutate(LL_Stats,
                             UniqueLatLng = paste(Lat_Med, Lng_Med, sep = "__")
                            )

# str(LL_Stats_UnqLatLng)
# View(head(LL_Stats_UnqLatLng))


LL_StatsZips <- left_join(LL_Stats_UnqLatLng,
                          Zips_All,
                          by = c("UniqueLatLng" = "UniqueLatLng")
                         )

str(LL_StatsZips)
# View(head(LL_StatsZips))

# Not sure whey these couldn't be found (why they're NA)
View(filter(LL_StatsZips,
            is.na(postalCode)
           )
    )

```


Join to create one dataset that also includes Zip variables.
```{r}

rm(url_1, url_2, url_3, username, pages0, pages1, pages2, pages3, pages4, pages5, pages6, pages7, pages8, pages9, pages10, i, lat, lng, APIData0, APIData1, APIData2, APIData3, APIData4, APIData5, APIData6, APIData7, APIData8, APIData9, APIData10, LL_Stats, LL_Stats_UnqLatLng)


AllDays_Zips <- left_join(AllDays_StopIDNew,
                          LL_StatsZips,
                          by = c("StopID_Clean" = "StopID_Clean")
                         ) %>% 
  rename(Stop_State = adminCode1,
         Stop_County = adminName2,
         Stop_City = placeName,
         Stop_Zip = postalCode
         )

rm(AllDays_StopIDNew, LL_StatsZips)
str(AllDays_Zips)

```


Updating variable types.
```{r}

AllDays_Zips$Stop_State <- factor(AllDays_Zips$Stop_State)
AllDays_Zips$Stop_County <- factor(AllDays_Zips$Stop_County)
AllDays_Zips$Stop_Zip <- factor(AllDays_Zips$Stop_Zip)
AllDays_Zips$Stop_City <- factor(AllDays_Zips$Stop_City)

AllDays_Zips$distance <- as.numeric(AllDays_Zips$distance)
AllDays_Zips$countryCode <- factor(AllDays_Zips$countryCode)
AllDays_Zips$adminName1 <- factor(AllDays_Zips$adminName1)

str(AllDays_Zips)

```


Feature engineering.

Inspecting incidences of consecutive Stop_IDs. This is done because investigation showed that many conseutive events occurr at the same Stop_ID, but with various Dwell_Times, Odometer_Distances, etc.  All of which affect calculations and analyses.

Create data on the runs (consecutive Stop_IDs).
```{r}

StopID_Runs <- rle(AllDays_Zips$StopID_Clean)

StopID_Runs$ends <- cumsum(StopID_Runs$lengths)

StopID_Runs$starts <- ifelse(is.na(lag(StopID_Runs$ends)
                                  ),
                             1,
                             lag(StopID_Runs$ends) + 1
                            )

str(StopID_Runs)
# class(StopID_Runs)
# 
# StopID_Runs_df <- data.frame(unclass(StopID_Runs))
# str(StopID_Runs_df)
# class(StopID_Runs_df)
# rm(StopID_Runs_df)

```


Trying to link data on RunsGroups with the original data (AllDays_Sorted). The goal is to select only one record per RunsGroup - that being the record with the longest Dwell_Time.

I attempted this computation using both data.frames (dplyr) and data.tables (data.table). However, with 2,809,062 rows in one dataset and 3,119,443 rows in the other dataset, the current computation time is over 5 days...so I'm trying a different strategy to only select the first record in a run.
```{r}

# Create a RunsGroup variable for each run
# StopID_Runs_df$RunsGroup <- paste0("g", seq(1:nrow(StopID_Runs_df)
#                                            )
#                                   )
# 
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
# 
# StopID_Runs_df <- StopID_Runs_df %>% 
#   mutate(RowNum = row_number()
#         )
# 
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
# 
# 
# # Converting to data.tables for, hopefully, improved performance (speed) in computation
# StopID_Runs_dt <- data.table(StopID_Runs_df)
# setkey(StopID_Runs_dt, RowNum)
# str(StopID_Runs_dt)
# 
# AllDays_Sorted_dt <- data.table(AllDays_Sorted)
# setkey(AllDays_Sorted_dt, RowNum_OG)
# str(AllDays_Sorted_dt)
# # rm(AllDays_Sorted_dt)
# 
# 
# # Actual loop to perform the computations and link to original data (AllDays_Sorted_dt)
# GroupData <- list()
# for(i in 1:nrow(StopID_Runs_dt)
#    ) {
#   assign(paste0("group_", i),
#            StopID_Runs_dt[RowNum == i, RunsGroup]
#           )
# 
#     #####  The code below is the same code as above, but done with dplyr  #####
# 
#     # assign(paste0("group_", i),
#   #        filter(StopID_Runs_df,
#   #               RowNum == i
#   #              ) %>% 
#   #          select(RunsGroup)
#   #       )
# 
#   assign(paste0("group_", i, "_start"),
#          StopID_Runs_dt[RowNum == i, starts]
#         )
# 
#   assign(paste0("group_", i, "_end"),
#          StopID_Runs_dt[RowNum == i, ends]
#         )
# 
#   assign(paste0("group_", i, "_rows"),
#          AllDays_Sorted_dt[RowNum_OG >= as.numeric(get(paste0("group_", i, "_start")
#                                                       )
#                                                   ) &
#                            RowNum_OG <= as.numeric(get(paste0("group_", i, "_end")
#                                                       )
#                                                   ),
#                            RunsGroup := as.character(get(paste0("group_", i)
#                                                         )
#                                                     )
#                           ]
# 
#     #####  The code below is the same as the code above, but done with dplyr  #####
# 
#          # filter(AllDays_Sorted,
#          #        between(RowNum_OG,
#          #                as.numeric(get(paste0("group_", i, "_start")
#          #                              )
#          #                          ),
#          #                as.numeric(get(paste0("group_", i, "_end")
#          #                              )
#          #                          )
#          #               )
#          #       ) %>% 
#          #   mutate(RunsGroup = as.character(get(paste0("group_", i)
#          #                                     )
#          #                                 )
#          #        )
#         )
# 
#   GroupData[[i]] <- get(paste0("group_", i, "_rows"))
# 
#   message("Processing Group ", i, " of 2,809,062")
# }
# 
# 
# GroupData_df <- rbind.fill(GroupData)
# str(GroupData_df)
# head(GroupData_df)
# tail(GroupData_df)
# # rm(GroupData_df)
# 
# 
# group_1
# group_1_start
# group_1_end
# group_1_rows
# group_2_rows
# group_3_rows
# group_50_rows
# str(group_50_rows)
# group_2809062_rows
# GroupData[[1]]
# GroupData[[50]]
# 
# 
# #####  Testing Area (Below)  #####
# #####  Testing Area (Below)  #####
# #####  Testing Area (Below)  #####
# 
# # head(StopID_Runs$starts, 20)
# # head(AllDays_NewOrder$Stop_ID, 20)
# # 
# # 
# # dat <- as.data.frame(c(1,1,7,7,7,9,6,8,2,2,2,1,1,1,1,1))
# # colnames(dat)[1] <- "dat"
# # r <- rle(dat$dat)
# # dat$run <- rep(r$lengths, r$lengths)
# # dat$runLag <- lag(dat$run)
# # dat$cond <- rep(r$values, r$lengths)
# # dat
# # View(dat)

```


When consecutive Stop_ID occurrs, only take the first occurrence. This is done because the computation time to select only the record with the longest Dwell_Time for each run was too long (over 5 days).

This is probably less than ideal with regards to Dwell_Time, but should not make much difference for calculations of travel time, speed, etc.
```{r}

AllDays_FirstStopID <- AllDays_Zips[StopID_Runs$starts, ]

dim(AllDays_Zips)
dim(AllDays_FirstStopID)

nrow(AllDays_Zips) - nrow(AllDays_FirstStopID)

rm(AllDays_Zips, StopID_Runs)
str(AllDays_FirstStopID)

```


Feature engineering.

Creating new variables.
```{r}

AllDays_AddVars <- mutate(AllDays_FirstStopID,
                          Odometer_Distance_Mi = Odometer_Distance / 5280, #5,280 feet in 1 mile
                          Dwell_Time2 = as.numeric(Departure_Time - Event_Time),
                          Event_Time_Yr = as.integer(year(Event_Time)),
                          Event_Time_Mth = as.integer(month(Event_Time)),
                          Event_Time_Date = day(Event_Time),
                          Event_Time_Day = wday(Event_Time, label = TRUE),
                          Event_Time_Hr = hour(Event_Time),
                          Event_Time_Min = minute(Event_Time),
                          Event_Time_HrGroup = factor(ifelse(Event_Time_Hr < 3,
                                                             "Group0_2",
                                                      ifelse(Event_Time_Hr < 6,
                                                             "Group3_5",
                                                      ifelse(Event_Time_Hr < 9,
                                                             "Group6_8",
                                                      ifelse(Event_Time_Hr < 12,
                                                             "Group9_11",
                                                      ifelse(Event_Time_Hr < 15,
                                                             "Group12_14",
                                                      ifelse(Event_Time_Hr < 18,
                                                             "Group15_17",
                                                      ifelse(Event_Time_Hr < 21,
                                                             "Group18_20",
                                                      ifelse(Event_Time_Hr < 24,
                                                             "Group21_23"
                                                            )))))))),
                                                         levels = c("Group0_2",
                                                                    "Group3_5",
                                                                    "Group6_8",
                                                                    "Group9_11",
                                                                    "Group12_14",
                                                                    "Group15_17",
                                                                    "Group18_20",
                                                                    "Group21_23"
                                                                   ),
                                                         ordered = TRUE
                                                     )
                         )

rm(AllDays_FirstStopID)
str(AllDays_AddVars)


# group_by(AllDays_AddVars,
#          Event_Time_HrGroup
#         ) %>% 
#   summarise(Cnts = n()
#            )


# View(head(filter(AllDays_AddVars,
#                  Event_Time_Hr == 0
#                 ),
#           50
#          )
#     )

# View(head(AllDays_AddVars, 50))

```


<!-- Function for calculating the distance traveled based on the Haversine formula.  Original code from: https://www.r-bloggers.com/great-circle-distance-calculations-in-r/ -->
<!-- ```{r} -->

<!-- gcd.hf <- function(long1, lat1, long2, lat2) { -->
<!--   R <- 6371 # Earth mean radius [km] -->
<!--   delta.long <- (long2 - long1) -->
<!--   delta.lat <- (lat2 - lat1) -->
<!--   a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2 -->
<!--   c <- 2 * asin(min(1,sqrt(a))) -->
<!--   d = R * c * 0.621371 # 1 km = 0.621371 miles -->
<!--   return(d) # Distance in miles -->
<!-- } -->

<!-- ``` -->


Feature engineering.

Creating more variables. Creating a BusEvent row number for future identification purposes. Then, creating various variables to analyze distance traveled and speed.
```{r}

AllDays_BusDay <- group_by(AllDays_AddVars,
                           Bus_ID,
                           Event_Time_Date
                          ) %>% 
  mutate(BusDay_EventNum = row_number(),  # used to identify Bus movements on a particular date
         
         Route_Lag1 = lag(Route),  # used in future analyses to identify Route changes
         RouteAlt_Lag1 = lag(RouteAlt),  # used in future analyses to identify RouteAlt (direction) changes
         
         Odometer_Distance_Lag1 = lag(Odometer_Distance),
         
         Latitude_L1 = lag(Latitude),
         Longitude_L1 = lag(Longitude),
         # Lat_Radian = Latitude*pi/180,
         # Long_Radian = Longitude*pi/180,
         # Lat_Radian_L1 = lag(Lat_Radian),
         # Long_Radian_L1 = lag(Long_Radian),
         
         # accounting for potential negative distances
         TravelDistance_Ft = ifelse(Odometer_Distance > Odometer_Distance_Lag1,
                                    Odometer_Distance - Odometer_Distance_Lag1,
                                    NA
                                   ),
         TravelDistance_Mi = TravelDistance_Ft / 5280, #5,280 feet in 1 mile
         
         # TravelDistance_Mi2 = gcd.hf(long1 = Long_Radian_L1,
         #                             lat1 = Lat_Radian_L1,
         #                             long2 = Long_Radian,
         #                             lat2 = Lat_Radian
         #                            ),
         
         TravelDistance_Mi_Hvrs = 
                              # ifelse((is.na(Longitude_L1) | is.na(Latitude_L1)
                              #        ),
                              #        NA,
                              distHaversine(cbind(Longitude_L1, Latitude_L1),
                                            cbind(Longitude, Latitude)
                                           ) * 0.000621371, # 0.000621371 miles = 1 meter
         
         # accounting for potential negative times
         TravelTime_Sec = as.numeric(ifelse(Event_Time > lag(Departure_Time),
                                            Event_Time - lag(Departure_Time),
                                            NA
                                           )
                                    ),
         TravelTime_Hr = TravelTime_Sec / 3600, # 3,600 seconds in 1 hour
         
         # accounting for potential negative or zero travel times
         SpeedAvg_Mph = ifelse(TravelTime_Hr > 0,
                               TravelDistance_Mi / TravelTime_Hr,
                               NA
                              ),
         
         Start_ID = lag(StopID_Clean),
         Start_Desc = lag(Stop_Desc),
         StartStop_ID = ifelse(is.na(Start_ID),
                               paste("NULL", StopID_Clean, sep = "--"),
                               paste(Start_ID, StopID_Clean, sep = "--")
                              )
        ) %>% 
  as.data.frame()


rm(AllDays_AddVars)
str(AllDays_BusDay)

# summary(AllDays_BusDay)

# View(tail(AllDays_BusDay, 50))

```


Inspecting for issues with StartStop_ID (where the value is either NA or contains NULL). They ONLY exist when BusDay_EventNum = 1 (which is by design). So everything looks OK.
```{r}

View(group_by(AllDays_BusDay,
              StartStop_ID
             ) %>% 
       summarise(
         Cnt = n()
       ) %>% 
       arrange(desc(Cnt)
              )
    )

View(filter(AllDays_BusDay,
            (is.na(StartStop_ID) |
              str_detect(StartStop_ID, "NULL")
            ) &
              BusDay_EventNum != 1
           )
    )

```


Stats (quantiles) overall for TravelDistance_Mi.
```{r}

Quantiles_dt <- AllDays_BusDay %>% 
  mutate(TD_Mi_q2 = quantile(x = TravelDistance_Mi, probs = 0.02, na.rm = TRUE),
         TD_Mi_q98 = quantile(x = TravelDistance_Mi, probs = 0.98, na.rm = TRUE),
         TT_Sec_q2 = quantile(x = TravelTime_Sec, probs = 0.02, na.rm = TRUE),
         TT_Sec_q98 = quantile(x = TravelTime_Sec, probs = 0.98, na.rm = TRUE),
         TT_Hr_q2 = quantile(x = TravelTime_Hr, probs = 0.02, na.rm = TRUE),
         TT_Hr_q98 = quantile(x = TravelTime_Hr, probs = 0.98, na.rm = TRUE)
        ) %>% 
  data.table()


Stats <- Quantiles_dt %>% 
  mutate(TD_Mi_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_Mean_F = mean(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98],
                             na.rm = TRUE
                            ),
         TD_Mi_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_Med_F = median(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98],
                              na.rm = TRUE
                             ),
         TD_Mi_Cnt = sum(!is.na(TravelDistance_Mi)
                        ),
         TD_Mi_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_q2 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_q98]
                                 )
                          ),
            
         TT_Sec_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_Mean_F = mean(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98],
                              na.rm = TRUE
                             ),
         TT_Sec_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_Med_F = median(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98],
                               na.rm = TRUE
                              ),
         TT_Sec_Cnt = sum(!is.na(TravelTime_Sec)
                         ),
         TT_Sec_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_q2 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_q98]
                                   )
                           ),

         TT_Hr_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_Mean_F = mean(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98],
                             na.rm = TRUE
                            ),
         TT_Hr_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_Med_F = median(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98],
                              na.rm = TRUE
                             ),
         TT_Hr_Cnt = sum(!is.na(TravelTime_Hr)
                        ),
         TT_Hr_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_q2 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_q98]
                                 )
                          )
        ) %>% 
  data.frame()

rm(AllDays_BusDay)
rm(Quantiles_dt)
str(Stats)
# View(head(Stats, 50))

```


Stats for StartStop_ID.
```{r}

Quantiles_SS_dt <- group_by(Stats,
                            StartStop_ID
                           ) %>% 
  mutate(TD_Mi_SS_q5 = quantile(x = TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
         TD_Mi_SS_q95 = quantile(x = TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
         TT_Sec_SS_q5 = quantile(x = TravelTime_Sec, probs = 0.05, na.rm = TRUE),
         TT_Sec_SS_q95 = quantile(x = TravelTime_Sec, probs = 0.95, na.rm = TRUE),
         TT_Hr_SS_q5 = quantile(x = TravelTime_Hr, probs = 0.05, na.rm = TRUE),
         TT_Hr_SS_q95 = quantile(x = TravelTime_Hr, probs = 0.95, na.rm = TRUE)
        ) %>% 
  data.table()


Stats_StSt <- group_by(Quantiles_SS_dt,
                       StartStop_ID
                      ) %>% 
  mutate(TD_Mi_SS_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SS_Mean_F = mean(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95],
                                na.rm = TRUE
                               ),
         TD_Mi_SS_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SS_Med_F = median(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95],
                                 na.rm = TRUE
                                ),
         TD_Mi_SS_Cnt = sum(!is.na(TravelDistance_Mi)
                           ),
         TD_Mi_SS_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_SS_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SS_q95]
                                    )
                             ),
            
         TT_Sec_SS_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SS_Mean_F = mean(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95],
                                 na.rm = TRUE
                                ),
         TT_Sec_SS_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SS_Med_F = median(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95],
                                  na.rm = TRUE
                                 ),
         TT_Sec_SS_Cnt = sum(!is.na(TravelTime_Sec)),
         TT_Sec_SS_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_SS_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SS_q95]
                                     )
                              ),

         TT_Hr_SS_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SS_Mean_F = mean(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95],
                                na.rm = TRUE
                               ),
         TT_Hr_SS_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SS_Med_F = median(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95],
                                 na.rm = TRUE
                                ),
         TT_Hr_SS_Cnt = sum(!is.na(TravelTime_Hr)),
         TT_Hr_SS_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_SS_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SS_q95]
                                    )
                             )
        ) %>% 
  data.frame()

rm(Stats)
rm(Quantiles_SS_dt)
str(Stats_StSt)
# View(head(Stats_StSt, 50))

```


Stats for StartStop_ID with Event_Time_HrGroup.
```{r}

Quantiles_SSHG_dt <- group_by(Stats_StSt,
                              StartStop_ID,
                              Event_Time_HrGroup
                             ) %>% 
  mutate(TD_Mi_SSHG_q5 = quantile(x = TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
         TD_Mi_SSHG_q95 = quantile(x = TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
         TT_Sec_SSHG_q5 = quantile(x = TravelTime_Sec, probs = 0.05, na.rm = TRUE),
         TT_Sec_SSHG_q95 = quantile(x = TravelTime_Sec, probs = 0.95, na.rm = TRUE),
         TT_Hr_SSHG_q5 = quantile(x = TravelTime_Hr, probs = 0.05, na.rm = TRUE),
         TT_Hr_SSHG_q95 = quantile(x = TravelTime_Hr, probs = 0.95, na.rm = TRUE)
        ) %>% 
  data.table()


Stats_StSt_HrGrp <- group_by(Quantiles_SSHG_dt,
                             StartStop_ID,
                             Event_Time_HrGroup
                            ) %>% 
  mutate(TD_Mi_SSHG_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SSHG_Mean_F = mean(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95],
                                  na.rm = TRUE
                                 ),
         TD_Mi_SSHG_Med = median(TravelDistance_Mi, na.rm = TRUE),
         TD_Mi_SSHG_Med_F = median(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TD_Mi_SSHG_Cnt = sum(!is.na(TravelDistance_Mi)
                             ),
         TD_Mi_SSHG_Cnt_F = sum(!is.na(TravelDistance_Mi[TD_Mi_SSHG_q5 <= TravelDistance_Mi & TravelDistance_Mi <= TD_Mi_SSHG_q95]
                                      )
                               ),
            
         TT_Sec_SSHG_Mean = mean(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SSHG_Mean_F = mean(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TT_Sec_SSHG_Med = median(TravelTime_Sec, na.rm = TRUE),
         TT_Sec_SSHG_Med_F = median(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95],
                                    na.rm = TRUE
                                   ),
         TT_Sec_SSHG_Cnt = sum(!is.na(TravelTime_Sec)),
         TT_Sec_SSHG_Cnt_F = sum(!is.na(TravelTime_Sec[TT_Sec_SSHG_q5 <= TravelTime_Sec & TravelTime_Sec <= TT_Sec_SSHG_q95]
                                       )
                                ),

         TT_Hr_SSHG_Mean = mean(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SSHG_Mean_F = mean(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95],
                                  na.rm = TRUE
                                 ),
         TT_Hr_SSHG_Med = median(TravelTime_Hr, na.rm = TRUE),
         TT_Hr_SSHG_Med_F = median(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95],
                                   na.rm = TRUE
                                  ),
         TT_Hr_SSHG_Cnt = sum(!is.na(TravelTime_Hr)),
         TT_Hr_SSHG_Cnt_F = sum(!is.na(TravelTime_Hr[TT_Hr_SSHG_q5 <= TravelTime_Hr & TravelTime_Hr <= TT_Hr_SSHG_q95]
                                      )
                               )
        ) %>% 
  data.frame()

rm(Stats_StSt)
rm(Quantiles_SSHG_dt)
str(Stats_StSt_HrGrp)
# View(head(Stats_StSt_HrGrp, 50))

```


Feature engineering.

Creating a BusEventRoute row number, and a RouteAlt_Lag1 indicator for future identification purposes. 
```{r}

# rm(Quantiles_dt)
# rm(Quantiles_SS_dt)
# rm(AllDays_BusDay)
# rm(Quantiles_SSHG_dt)
# rm(Stats_StSt)

# AllDays_BusDayRoute <- group_by(Stats_StSt_HrGrp,
#                                 Bus_ID,
#                                 Event_Time_Date,
#                                 Route
#                                ) %>% 
#   mutate(RouteAlt_Lag2 = lag(RouteAlt)  # used in future analyses to identify RouteAlt (direction) changes
#          
#          # Odometer_Distance_Lag1 = lag(Odometer_Distance),
#          # 
#          # # accounting for potential negative distances
#          # TravelDistance_Ft = ifelse(Odometer_Distance >= Odometer_Distance_Lag1,
#          #                            Odometer_Distance - Odometer_Distance_Lag1,
#          #                            NA
#          #                           ),
#          # TravelDistance_Mi = TravelDistance_Ft / 5280, #5,280 feet in 1 mile
#          # 
#          # # accounting for potential negative times
#          # TravelTime_Sec = as.numeric(ifelse(Event_Time >= lag(Departure_Time),
#          #                                    Event_Time - lag(Departure_Time),
#          #                                    NA
#          #                                   )
#          #                            ),
#          # TravelTime_Hr = TravelTime_Sec / 3600, # 3,600 seconds in 1 hour
#          # 
#          # # accounting for potential negative or zero travel times
#          # SpeedAvg_Mph = ifelse(TravelTime_Hr > 0,
#          #                       TravelDistance_Mi / TravelTime_Hr,
#          #                       NA
#          #                      )
#         ) %>% 
#   data.frame()
# 
# rm(Stats_StSt_HrGrp)
# str(AllDays_BusDayRoute)

```


Feature engineering.

Calculating a variable to know if the RouteAlt changed. Could be useful in helping identifying weirdness in calculated distances and speeds.
```{r}

# rm(Stats_StSt_HrGrp)

AllDays_DirChange <- Stats_StSt_HrGrp %>%  # AllDays_BusDayRoute %>% 
  mutate(RteChange = ifelse(Route == Route_Lag1,
                            "Same",
                            "Change"
                           ),
         RteChange2 = factor(ifelse(is.na(RteChange),
                                    "Change",
                                    RteChange
                                   )
                            ),
         DirChange = ifelse(RouteAlt == RouteAlt_Lag1,
                            "Same",
                            "Change"
                           ),
         DirChange2 = factor(ifelse(is.na(DirChange),
                                    "Change",
                                    DirChange
                                   )
                            )
        )

# rm(AllDays_BusDayRoute)
rm(Stats_StSt_HrGrp)
str(AllDays_DirChange)

View(filter(AllDays_DirChange,
            between(RowNum_OG, 2570060, 2570080)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )

```


Re-ordering the variables to ease with comprehension.
```{r}

AllDays_NewOrder <-  select(AllDays_DirChange,
                            RowNum_OG,
                            UniqueLatLng,
                            group,
                            StartStop_ID,
                            BusDay_EventNum,
                            Bus_ID,
                            Route,
                            RteChange2,
                            RouteAlt,
                            # RouteAlt_Lag1,
                            DirChange2,
                            Route_Direction,
                            Stop_Sequence,
                            Start_ID,
                            Start_Desc,
                            # Stop_ID,
                            StopID_Clean,
                            StopID_Indicator,
                            Stop_Desc,
                            countryCode,
                            Stop_State,
                            Stop_County,
                            Stop_City,
                            Stop_Zip,
                            Event_Type,
                            Event_Description,
                            Event_Time_Yr,
                            Event_Time_Mth,
                            Event_Time_Date,
                            Event_Time_Day,
                            Event_Time_Hr,
                            Event_Time_HrGroup,
                            Event_Time_Min,
                            Event_Time,
                            Departure_Time,
                            Dwell_Time,
                            Dwell_Time2,
                            Delta_Time,
                            Latitude,
                            Longitude,
                            Heading,
                            Odometer_Distance,
                            Odometer_Distance_Lag1,
                            Odometer_Distance_Mi,
                            TravelDistance_Ft,
                            TravelDistance_Mi,
                            TravelDistance_Mi_Hvrs,
                            TD_Mi_q2,
                            TD_Mi_q98,
                            TD_Mi_SS_q5,
                            TD_Mi_SS_q95,
                            TD_Mi_SSHG_q5,
                            TD_Mi_SSHG_q95,
                            TD_Mi_Mean,
                            TD_Mi_Mean_F,
                            TD_Mi_SS_Mean,
                            TD_Mi_SS_Mean_F,
                            TD_Mi_SSHG_Mean,
                            TD_Mi_SSHG_Mean_F,
                            TD_Mi_Med,
                            TD_Mi_Med_F,
                            TD_Mi_SS_Med,
                            TD_Mi_SS_Med_F,
                            TD_Mi_SSHG_Med,
                            TD_Mi_SSHG_Med_F,
                            TD_Mi_Cnt,
                            TD_Mi_Cnt_F,
                            TD_Mi_SS_Cnt,
                            TD_Mi_SS_Cnt_F,
                            TD_Mi_SSHG_Cnt,
                            TD_Mi_SSHG_Cnt_F,
                            TravelTime_Sec,
                            TT_Sec_q2,
                            TT_Sec_q98,
                            TT_Sec_SS_q5,
                            TT_Sec_SS_q95,
                            TT_Sec_SSHG_q5,
                            TT_Sec_SSHG_q95,
                            TT_Sec_Mean,
                            TT_Sec_Mean_F,
                            TT_Sec_SS_Mean,
                            TT_Sec_SS_Mean_F,
                            TT_Sec_SSHG_Mean,
                            TT_Sec_SSHG_Mean_F,
                            TT_Sec_Med,
                            TT_Sec_Med_F,
                            TT_Sec_SS_Med,
                            TT_Sec_SS_Med_F,
                            TT_Sec_SSHG_Med,
                            TT_Sec_SSHG_Med_F,
                            TT_Sec_Cnt,
                            TT_Sec_Cnt_F,
                            TT_Sec_SS_Cnt,
                            TT_Sec_SS_Cnt_F,
                            TT_Sec_SSHG_Cnt,
                            TT_Sec_SSHG_Cnt_F,
                            TravelTime_Hr,
                            TT_Hr_q2,
                            TT_Hr_q98,
                            TT_Hr_SS_q5,
                            TT_Hr_SS_q95,
                            TT_Hr_SSHG_q5,
                            TT_Hr_SSHG_q95,
                            TT_Hr_Mean,
                            TT_Hr_Mean_F,
                            TT_Hr_SS_Mean,
                            TT_Hr_SS_Mean_F,
                            TT_Hr_SSHG_Mean,
                            TT_Hr_SSHG_Mean_F,
                            TT_Hr_Med,
                            TT_Hr_Med_F,
                            TT_Hr_SS_Med,
                            TT_Hr_SS_Med_F,
                            TT_Hr_SSHG_Med,
                            TT_Hr_SSHG_Med_F,
                            TT_Hr_Cnt,
                            TT_Hr_Cnt_F,
                            TT_Hr_SS_Cnt,
                            TT_Hr_SS_Cnt_F,
                            TT_Hr_SSHG_Cnt,
                            TT_Hr_SSHG_Cnt_F,
                            SpeedAvg_Mph
                           )

rm(AllDays_DirChange)
str(select(AllDays_NewOrder,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(AllDays_NewOrder)

# View(head(AllDays_NewOrder, 500))
# View(tail(AllDays_NewOrder, 500))

```


Summarizing the data to help spot anomolies.
```{r}

View(group_by(AllDays_NewOrder,
              Stop_City) %>% 
       summarise(Cnt_Num = n(),
                 Cnt_Pct = 100*Cnt_Num / (nrow(AllDays_NewOrder)
                                         )
                ) %>% 
       arrange(desc(Cnt_Num))
)

summary(AllDays_NewOrder)

```


Investigation of TravelDistance_Mi.

View(TravDistMi_Pctiles): 99% of TravelDistance_Mi are about 1 mile or less...but some weird TravelDistance_Mi values (e.g., 584 miles traveled) exist.
```{r}

TravDistMi_Ntile <- as.data.frame(AllDays_NewOrder$TravelDistance_Mi) %>% 
  mutate(#Pctile = ntile(AllDays_NewOrder$TravelDistance_Mi, 100),
         #MinR = min_rank(AllDays_NewOrder$TravelDistance_Mi),
         PctR = percent_rank(AllDays_NewOrder$TravelDistance_Mi),
         PctR_Round = round(PctR, 2)
        ) 

colnames(TravDistMi_Ntile)[1] <- "TravelDistance_Mi"
# str(TravDistMi_Ntile)

TravDistMi_Ntile_Rows <- nrow(TravDistMi_Ntile)

# View(tail(TravDistMi_Ntile, 500))


TravDistMi_Pctiles <- group_by(TravDistMi_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinTravDistMiAtPctile = min(TravelDistance_Mi),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravDistMi_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

rm(TravDistMi_Ntile)
rm(TravDistMi_Ntile_Rows)

View(TravDistMi_Pctiles)
TravDistMi_Pctiles

```


Investigation of TravelDistance_Mi.

Why are some TravelDistance_Mi "NA"? It looks like partially because the records are the first trip of the day (for that bus), so I purposefully set the distance to "NA". Another reason is due to the odometer recording a value less than the previous odometer recording. In most cases, I have no explanation for this - though I have observed about 67% of all instances where TravelDistance_Mi is NA (other than because it's the first record of the day) are instances where DirChange2 is "Change". This is weird and should be asked to WMATA.
```{r}

# View(head(AllDays_NewOrder, 500))

View(filter(AllDays_NewOrder,
            BusDay_EventNum != 1 # When BusDay_EventNum == 1, TravelDistance_Mi is NA by design (don't want to calculate distance based on yesterday's position)
           ) %>% 
       group_by(StartStop_ID) %>% 
       summarise(Cnts = sum(is.na(TravelDistance_Mi)
                           )
                ) %>% 
       arrange(desc(Cnts)
              )
    )

View(filter(AllDays_NewOrder,
            StartStop_ID == "1000245--1000211"
           ) %>% 
       select(RowNum_OG,
              StartStop_ID,
              Event_Time,
              Event_Time_HrGroup,
              Bus_ID,
              TravelDistance_Mi,
              TravelDistance_Mi_Hvrs,
              TD_Mi_SS_Mean,
              TD_Mi_SS_Mean_F,
              TD_Mi_SSHG_Mean,
              TD_Mi_SSHG_Mean_F,
              TD_Mi_SS_Med,
              TD_Mi_SS_Med_F,
              TD_Mi_SSHG_Med,
              TD_Mi_SSHG_Med_F,
              TD_Mi_SS_Cnt,
              TD_Mi_SS_Cnt_F,
              TD_Mi_SSHG_Cnt,
              TD_Mi_SSHG_Cnt_F
              ) %>% 
       mutate(Ratio_MeanToHvrs = TD_Mi_SS_Mean / TravelDistance_Mi_Hvrs) %>% 
       arrange(Event_Time)
    )

View(filter(AllDays_NewOrder,
            is.na(TravelDistance_Mi)
           )
    )

# These records are NA becuase the record is the first record of the day (the Event_Time_Date)
View(filter(AllDays_NewOrder,
            between(RowNum_OG, 326, 346) | # 336
              between(RowNum_OG, 591, 611) | # 601
              between(RowNum_OG, 845, 865) # 855
           )
    )

```


Investigation of TravelDistance_Mi.

These records are NA becuase the current record odometer is less than the previous record odometer. Theoretically, this should NOT happen. Me: it appears that about 67% of all instances where TravelDistance_Mi is NA (other than because it's th first record of the day) are instances where DirChange2 is "Change". This is weird and should be asked to WMATA.
```{r}

View(filter(AllDays_NewOrder,
            between(RowNum_OG, 194, 214) | # 204
              between(RowNum_OG, 440, 460) | # 450
              between(RowNum_OG, 478, 498) | # 488
              between(RowNum_OG, 510, 530) # 520
           )
    )

TestTable <- filter(AllDays_NewOrder,
                    BusDay_EventNum != 1
                   ) %>% 
  mutate(TravelDistance_NA = as.factor(ifelse(is.na(TravelDistance_Mi),
                                              "True",
                                              "False"
                                             )
                                      )
        ) %>%
  group_by(DirChange2, TravelDistance_NA) %>%
  summarise(TravDistMi_NACnts = n()
           )

# TestTable

TestTable_Spread <- as.data.frame(spread(TestTable,
                                         TravelDistance_NA,
                                         TravDistMi_NACnts
                                        )
                                 ) %>% 
  select(False,
         True
        )

row.names(TestTable_Spread) <- c("Change", "Same")
# str(TestTable_Spread)
# TestTable_Spread

prop.table(as.table(as.matrix(TestTable_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(TestTable_Spread)
                   ),
           2
          )

```


Investigation of TravelDistance_Mi.

Let's look at just the TravelDistance_Mi values that are NOT "NA".
```{r}

rm(TestTable, TestTable_Spread)

TravelDistance_Mi_NoNA <- filter(AllDays_NewOrder,
                                 # TravelDistance_Mi != 0 &
                                 !is.na(TravelDistance_Mi)
                                )

dim(AllDays_NewOrder)
dim(TravelDistance_Mi_NoNA)
nrow(AllDays_NewOrder) - nrow(TravelDistance_Mi_NoNA)

str(TravelDistance_Mi_NoNA)
summary(TravelDistance_Mi_NoNA)

```


Investigation of TravelDistance_Mi.

Let's plot just the TravelDistance_Mi values that are NOT "NA".
```{r}

TravDistMi_HistDen <- ggplot(select(TravelDistance_Mi_NoNA,
                                    TravelDistance_Mi
                                   ),
                             aes(x = TravelDistance_Mi,
                                 y = ..density..
                                )
                            ) +
  geom_histogram(binwidth = 0.05, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 4.0)
                 ) +
  labs(title = "Variation in Distance Between Stops",
       x = "Travel Distance (miles)",
       y = "Density"
      )

TravDistMi_HistDen

```

Investigation of TravelDistance_Mi.

Looking at the extremely large TravelDistance_Mi values. Some (aprox 27%) of TravelDistance_Mi values > 1 mile are when the DirChange2 changes...but what about the other ~73%?
```{r}

rm(TravelDistance_Mi_NoNA)

# examples of weirdly large TravelDistance_Mi
View(filter(AllDays_NewOrder,
            TravelDistance_Mi > 1.1587121212 # 1.1587121212 is the 99th percentile
           ) %>% 
       arrange(desc(TravelDistance_Mi)
              )
    )


# Why are these extremes?  Airports?  Bus collection points?
View(filter(AllDays_NewOrder,
              between(RowNum_OG, 494044, 494064) | # 494054
              between(RowNum_OG, 494273, 494293) | # 494283
              between(RowNum_OG, 494626, 494646) | # 494636
              between(RowNum_OG, 1610156, 1610176) | # 1610166
              between(RowNum_OG, 2073074, 2073094) # 2073084
           )
    )

# Before Removing Runs
# View(filter(AllDays_Sorted,
#             between(RowNum_OG, 494044, 494064) | # 494054
#               between(RowNum_OG, 494273, 494293) | # 494283
#               between(RowNum_OG, 494626, 494646) | # 494636
#               between(RowNum_OG, 1610156, 1610176) | # 1610166
#               between(RowNum_OG, 2073074, 2073094) # 2073084
#            )
#     )

# After Removing Runs
# View(filter(AllDays_FirstStopID,
#             between(RowNum_OG, 494044, 494064) | # 494054
#               between(RowNum_OG, 494273, 494293) | # 494283
#               between(RowNum_OG, 494626, 494646) | # 494636
#               between(RowNum_OG, 1610156, 1610176) | # 1610166
#               between(RowNum_OG, 2073074, 2073094) # 2073084
#            )
#     )

```


Investigation of TravelDistance_Mi.

Any relation with DirChange2?  Doesn't look as if this is so.
```{r}

ExtremeTravDist <- filter(AllDays_NewOrder,
                          !is.na(TravelDistance_Mi)
                         ) %>% 
  mutate(TravDist_Extreme = ifelse(TravelDistance_Mi > 1.1587121212, # 1.1587121212 is the 99th percentile
                                   "True",
                                   "False"
                                  )
                          ) %>% 
  group_by(DirChange2, TravDist_Extreme) %>% 
  summarise(TravDistMI_ExtCnts = n()
           )

# ExtremeTravDist


ExtremeTravDist_Spread <- as.data.frame(spread(ExtremeTravDist,
                                               TravDist_Extreme,
                                               TravDistMI_ExtCnts
                                              )
                                       ) %>% 
  select(False,
         True
        )

row.names(ExtremeTravDist_Spread) <- c("Change", "Same")
# str(ExtremeTravDist_Spread)
# ExtremeTravDist_Spread

prop.table(as.table(as.matrix(ExtremeTravDist_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(ExtremeTravDist_Spread)
                   ),
           2
          )

```


Investigation of TravelDistance_Mi.

Looking at specific buses and StartStop_ID.
```{r}

rm(ExtremeTravDist, ExtremeTravDist_Spread)

View(arrange(group_by(AllDays_NewOrder,
                      Bus_ID
                     ) %>% 
               summarise(DistTrav_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
                         DistTrav_Med = median(TravelDistance_Mi, na.rm = TRUE)
                        ),
             desc(DistTrav_Med)
            )
    )


# example of extremely small TravelDistance_Mi values (looks like the odometer wasn't functioning)
View(filter(AllDays_NewOrder,
            Bus_ID == 6111 |
              Bus_ID == 7201 |
              Bus_ID == 8058
           ) %>% 
       arrange(Bus_ID, Event_Time)
    )


View(arrange(group_by(AllDays_NewOrder,
                      StartStop_ID
                     ) %>% 
               summarise(DistTrav_Mean = mean(TravelDistance_Mi, na.rm = TRUE),
                         DistTrav_Med = median(TravelDistance_Mi, na.rm = TRUE)
                        ),
             desc(DistTrav_Med)
            )
    )

# example of extremely large TravelDistance_Mi values...no idea why...
View(filter(AllDays_NewOrder,
            StartStop_ID == "1003665--12" |
              StartStop_ID == "1003665--5001925" |
              StartStop_ID == "3001038--3002565"
           ) %>% 
       arrange(StartStop_ID, Event_Time)
    )

```


Investigation of TravelDistance_Mi & TravelDistance_Mi_New.

If TravelDisntace_Mi is below the 5th percentile for that StartStop_ID, or if TravelDisntace_Mi is above the 95th percentile for that StartStop_ID, or if TravelDistance_Mi is NA (when the BusDay_EventNum !=1), consider this an outlier.  In this case, replace the value with the mean for that StartStop_ID and HourGroup (TD_Mi_SSHG_Mean_F), or if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.
```{r}

# View(tail(AllDays_NewOrder, 500))

AllDays_NewTravelDist <- 
  mutate(AllDays_NewOrder,
         TravelDistance_Mi_New = ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SSHG_Cnt_F >= 20,
                                        TD_Mi_SSHG_Mean_F,
                                 ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SSHG_Cnt_F < 20 &
                                          TD_Mi_SS_Cnt_F >= 20,
                                        TD_Mi_SS_Mean_F,
                                 ifelse(!is.na(TravelDistance_Mi) & 
                                          (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                                             TravelDistance_Mi > TD_Mi_SSHG_q95
                                          ) &
                                          TD_Mi_SS_Cnt_F < 20 &
                                          TD_Mi_SS_Cnt >= 20,
                                        TD_Mi_SS_Mean,
                                 ifelse(is.na(TravelDistance_Mi) &
                                          BusDay_EventNum != 1 &
                                          TravelDistance_Mi_Hvrs != 0,
                                        TravelDistance_Mi_Hvrs,
                                 ifelse(is.na(TravelDistance_Mi) &
                                          BusDay_EventNum != 1 &
                                          TravelDistance_Mi_Hvrs == 0,
                                        TD_Mi_SS_Mean,
                                        TravelDistance_Mi
                                       ))))),
         TravelDistance_Mi_New_Label = 
           factor(ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SSHG_Cnt_F >= 20,
                         "TD_Mi_SSHG_Mean_F",
                  ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SSHG_Cnt_F < 20 &
                           TD_Mi_SS_Cnt_F >= 20,
                         "TD_Mi_SS_Mean_F",
                  ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SS_Cnt_F < 20 &
                           TD_Mi_SS_Cnt >= 20,
                         "TD_Mi_SS_Mean",
                  ifelse(is.na(TravelDistance_Mi) &
                           BusDay_EventNum != 1 &
                           TravelDistance_Mi_Hvrs != 0,
                         "TravelDistance_Mi_Hvrs",
                  ifelse(is.na(TravelDistance_Mi) &
                           BusDay_EventNum != 1 &
                           TravelDistance_Mi_Hvrs == 0,
                         "TD_Mi_SS_Mean",
                         "TravelDistance_Mi"
                        )))))
                 ),
         TravelDistance_Mi_NewHvrs = ifelse(!is.na(TravelDistance_Mi_Hvrs) &
                                              TravelDistance_Mi_Hvrs != 0 &
                                              (TravelDistance_Mi_New < TD_Mi_q2 |
                                                 TravelDistance_Mi_New > TD_Mi_q98
                                              ),
                                            TravelDistance_Mi_Hvrs,
                                            TravelDistance_Mi_New
                                           ),
         TravelDistance_Mi_NewHvrs_Label =
           factor(ifelse(!is.na(TravelDistance_Mi_Hvrs) &
                           TravelDistance_Mi_Hvrs != 0 &
                           (TravelDistance_Mi_New < TD_Mi_q2 |
                              TravelDistance_Mi_New > TD_Mi_q98
                           ),
                         "TravelDistance_Mi_Hvrs",
                         as.character(TravelDistance_Mi_New_Label)
                        )
                 ),
         SpeedAvg_Mph_NewHvrs = TravelDistance_Mi_NewHvrs / TravelTime_Hr
        )

str(AllDays_NewTravelDist)

```


Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Quick summary and then correlation calculation.
```{r}

rm(AllDays_NewOrder)


# 38 rows meet this criteria anymore  --  appears to be the case when both the Lat Long calculations, and the TravelDistance calculations did not function properly.
View(filter(AllDays_NewTravelDist,
            is.na(TravelDistance_Mi_New) &
              BusDay_EventNum != 1
           )
    )

View(AllDays_NewTravelDist %>% 
       arrange(desc(TravelDistance_Mi_New)) %>% 
       head(500)
    )

summary(select(AllDays_NewTravelDist,
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      BusDay_EventNum != 1
                     ),
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )


cor(select(AllDays_NewTravelDist,
           TravelDistance_Mi,
           TravelDistance_Mi_Hvrs,
           TravelDistance_Mi_New,
           TravelDistance_Mi_NewHvrs
          ),
    use = "pairwise.complete.obs"
  )

```


Investigation of TravelDistance_Mi_NewHvrs_Label & TravelDistance_Mi_NewHvrs_Label.

Show how the labels changed.
```{r}

group_by(AllDays_NewTravelDist,
         TravelDistance_Mi_New_Label,
         TravelDistance_Mi_NewHvrs_Label
        ) %>% 
  summarise(CntNum = n(),
            CntPct = format(CntNum / nrow(AllDays_NewTravelDist),
                            scientific = 9999
                           )
           ) %>% 
  arrange(desc(CntPct)
         )

```


Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Graphing the two methods of calculating TravelDistance_Mi.

First, let's get create a function to plot the liner model equation.
```{r}

lm_eqn <- function(df, y, x){
  m <- lm(y ~ x, df)
  
  l <- list(a = format(coef(m)[1], digits = 2),
            b = format(abs(coef(m)[2]), digits = 2),
            s1 = ifelse(test = coef(m)[2] > 0,
                        yes = "+",
                        no = "-"
                       ),
            r2 = format(summary(m)$r.squared,
                        digits = 3
                       )
           )
  
  eq <- substitute(italic(y) == a~~s1~~b %.% italic(x)*","~~italic(r)^2~"="~r2,
                   l
                  )
  
  as.character(as.expression(eq)
              )             
}

```


Investigation of TravelDistance_Mi & TravelDistance_Mi_NewHvrs.

Scatter plot (using a 10% sample to making plotting time faster and to reduce un-needed data in the "same" splot).
```{r}

set.seed(123456789)
AllDays_NewTravelDist_10Pct <- filter(AllDays_NewTravelDist,
                                      !is.na(TravelDistance_Mi_NewHvrs) &
                                        !is.na(TravelDistance_Mi)
                                     ) %>% 
  rename(DistMethod = TravelDistance_Mi_NewHvrs_Label) %>% 
  sample_frac(0.1)


TravDist_MiVsCalc <- ggplot(select(AllDays_NewTravelDist_10Pct,
                                   TravelDistance_Mi_NewHvrs,
                                   TravelDistance_Mi,
                                   DistMethod
                                  ),
                            aes(x = TravelDistance_Mi,
                                y = TravelDistance_Mi_NewHvrs,
                                colour = DistMethod
                               )
                           ) +
  scale_colour_manual(values = c("red","blue", "green", "orange", "black")
                     ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 1.5)
                 ) +
  scale_x_continuous(breaks = seq(0, 1.5, 0.25)
                    ) +
  scale_y_continuous(breaks = seq(0, 1.5, 0.25)
                    ) +
  theme(legend.position = "bottom", #c(0.85, 0.40),
        legend.text = element_text(size = 6)
       ) +
  annotate(label = lm_eqn(df = AllDays_NewTravelDist_10Pct,
                          x = AllDays_NewTravelDist_10Pct$TravelDistance_Mi,
                          y = AllDays_NewTravelDist_10Pct$TravelDistance_Mi_NewHvrs
                         ),
           # x = 62,
           # y = 20,
           x = 0.70,
           y = 0.00,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  annotate(label = "Reference Line (slope = 1)",
           # x = 16,
           # y = 30,
           x = 0.80,
           y = 1.05,
           geom = "text",
           size = 3,
           colour = "red"
          ) +
  labs(title = "TravelDistance_Mi vs. TravelDistance_Mi_NewHvrs",
       x = "TravelDistance_Mi",
       y = "TravelDistance_Mi_NewHvrs"
      )
# +
#   geom_jitter()

TravDist_MiVsCalc

```


Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.

Graphing test with rbokeh.
```{r}

TravDist_MiVsCalc_Bokeh <- figure(data = select(AllDays_NewTravelDist_10Pct,
                                                TravelDistance_Mi_NewHvrs,
                                                TravelDistance_Mi,
                                                DistMethod
                                               ),
                                  xlim = c(0, 1.5),
                                  ylim = c(0, 1.5),
                                  legend_location = "bottom_right"
                                 ) %>% 
  ly_points(x = TravelDistance_Mi,
            y = TravelDistance_Mi_NewHvrs,
            color = DistMethod,
            hover = c(TravelDistance_Mi_NewHvrs, TravelDistance_Mi, DistMethod)
           ) %>% 
  ly_abline(a = 0, b = 1, color = "red")

TravDist_MiVsCalc_Bokeh

```


Investigation of TravelDistance_Mi_New.

Calculating the minimum TravelDistance_Mi_New value at each percentile.
```{r}

rm(TravDist_MiVsCalc_Bokeh)
rm(AllDays_NewTravelDist_10Pct)


summary(select(AllDays_NewTravelDist,
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      BusDay_EventNum != 1
                     ),
               TravelDistance_Mi,
               TravelDistance_Mi_Hvrs,
               TravelDistance_Mi_New,
               TravelDistance_Mi_NewHvrs
              )
       )


TravDistMiN_Ntile <- as.data.frame(select(AllDays_NewTravelDist,
                                          StartStop_ID,
                                          TravelDistance_Mi_New_Label,
                                          # TravelDistance_Mi_NewHvrs_Label,
                                          TravelDistance_Mi_New
                                          # TravelDistance_Mi_NewHvrs
                                         )
                                  ) %>% 
  mutate(PctR_N = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_New),
         # PctR_H = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_NewHvrs),
         PctR_Round_N = round(PctR_N, 2)
         # PctR_Round_H = round(PctR_H, 2)
        ) 

# str(TravDistMiN_Ntile)
# View(head(TravDistMiN_Ntile, 500))

TravDistMiN_Ntile_Rows <- nrow(TravDistMiN_Ntile)

# View(tail(TravDistMiN_Ntile, 500))


TravDistMiN_Pctiles <- group_by(TravDistMiN_Ntile,
                                PctR_Round_N
                               ) %>% 
  summarise(
    MinTDMiAtPctile_N = min(TravelDistance_Mi_New),
    # MinTDMiAtPctile_H = min(TravelDistance_Mi_NewHvrs),
    CntsAtPctile_N = sum(!is.na(TravelDistance_Mi_New)),
    # CntsAtPctile_H = sum(!is.na(TravelDistance_Mi_NewHvrs)),
    PctsAtPctile_N = CntsAtPctile_N / TravDistMiN_Ntile_Rows
    # PctsAtPctile_H = CntsAtPctile_H / TravDistMiN_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP_N = cumsum(PctsAtPctile_N)
         # CumSumPAtP_H = cumsum(PctsAtPctile_H)
        )

# View(TravDistMiN_Pctiles)

```


Investigation of TravelDistance_Mi_NewHvrs

Calculating the minimum TravelDistance_Mi_NewHvrs value at each percentile.
```{r}

TravDistMiH_Ntile <- as.data.frame(select(AllDays_NewTravelDist,
                                          StartStop_ID,
                                          # TravelDistance_Mi_New_Label,
                                          TravelDistance_Mi_NewHvrs_Label,
                                          # TravelDistance_Mi_New,
                                          TravelDistance_Mi_NewHvrs
                                         )
                                  ) %>% 
  mutate(# PctR_N = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_New),
         PctR_H = percent_rank(AllDays_NewTravelDist$TravelDistance_Mi_NewHvrs),
         # PctR_Round_N = round(PctR_N, 2),
         PctR_Round_H = round(PctR_H, 2)
        ) 

# str(TravDistMiH_Ntile)
# View(head(TravDistMiH_Ntile, 500))

TravDistMiH_Ntile_Rows <- nrow(TravDistMiH_Ntile)

# View(tail(TravDistMiH_Ntile, 500))


TravDistMiH_Pctiles <- group_by(TravDistMiH_Ntile,
                                PctR_Round_H
                               ) %>% 
  summarise(
    # MinTDMiAtPctile_N = min(TravelDistance_Mi_New),
    MinTDMiAtPctile_H = min(TravelDistance_Mi_NewHvrs),
    # CntsAtPctile_N = sum(!is.na(TravelDistance_Mi_New)),
    CntsAtPctile_H = sum(!is.na(TravelDistance_Mi_NewHvrs)),
    # PctsAtPctile_N = CntsAtPctile_N / TravDistMiH_Ntile_Rows,
    PctsAtPctile_H = CntsAtPctile_H / TravDistMiH_Ntile_Rows
  ) %>% 
  mutate(# CumSumPAtP_N = cumsum(PctsAtPctile_N),
         CumSumPAtP_H = cumsum(PctsAtPctile_H)
        )

# View(TravDistMiH_Pctiles)

```


Join TravDistMiH_Pctiles, TravDistMiN_Pctiles, and TravDistMi_Pctiles.

~11% of rides are still showing as less than 0.1 miles of TravelDistance_Mi_NewHvrs.
```{r}

rm(TravDistMiN_Ntile_Rows, TravDistMiH_Ntile_Rows, TravDistMiN_Ntile, TravDistMiH_Ntile)


# View(TravDistMi_Pctiles)
# View(TravDistMiN_Pctiles)
# View(TravDistMiH_Pctiles)

TravDistMi_Pctiles_All <- inner_join(x = TravDistMi_Pctiles,
                                     y = TravDistMiN_Pctiles,
                                     by = c("PctR_Round" = "PctR_Round_N")
                                    ) %>% 
  inner_join(y = TravDistMiH_Pctiles,
             by = c("PctR_Round" = "PctR_Round_H")
            ) %>% 
  select(PctR_Round,
         MinTravDistMiAtPctile,
         MinTDMiAtPctile_N,
         MinTDMiAtPctile_H,
         CntsAtPctile,
         CntsAtPctile_N,
         CntsAtPctile_H,
         PctsAtPctile,
         PctsAtPctile_N,
         PctsAtPctile_H,
         CumSumPAtP,
         CumSumPAtP_N,
         CumSumPAtP_H
         )

# str(TravDistMi_Pctiles_All)

rm(TravDistMi_Pctiles, TravDistMiN_Pctiles,TravDistMiH_Pctiles)


View(TravDistMi_Pctiles_All)
TravDistMi_Pctiles_All

```


Investigation of TravelDistance_Mi_New.

Why are there still some small or large TravelDistance_Mi_NewHvrs values.
```{r}

# View(filter(AllDays_NewTravelDist,
#             !is.na(TravelDistance_Mi_NewHvrs)
#            ) %>% 
#        select(-TD_Mi_q2,
#               -TD_Mi_q98,
#               -TD_Mi_SS_q5,
#               -TD_Mi_SS_q95,
#               -TD_Mi_SSHG_q5,
#               -TD_Mi_SSHG_q95,
#               -TD_Mi_Mean,
#               -TD_Mi_Mean_F,
#               -TD_Mi_SS_Mean,
#               -TD_Mi_SS_Mean_F,
#               -TD_Mi_SSHG_Mean,
#               -TD_Mi_SSHG_Mean_F,
#               -TD_Mi_Med,
#               -TD_Mi_Med_F,
#               -TD_Mi_SS_Med,
#               -TD_Mi_SS_Med_F,
#               -TD_Mi_SSHG_Med,
#               -TD_Mi_SSHG_Med_F,
#               -TD_Mi_Cnt,
#               -TD_Mi_Cnt_F,
#               -TD_Mi_SS_Cnt,
#               -TD_Mi_SS_Cnt_F,
#               -TD_Mi_SSHG_Cnt,
#               -TD_Mi_SSHG_Cnt_F,
#               -TT_Sec_q2,
#               -TT_Sec_q98,
#               -TT_Sec_SS_q5,
#               -TT_Sec_SS_q95,
#               -TT_Sec_SSHG_q5,
#               -TT_Sec_SSHG_q95,
#               -TT_Sec_Mean,
#               -TT_Sec_Mean_F,
#               -TT_Sec_SS_Mean,
#               -TT_Sec_SS_Mean_F,
#               -TT_Sec_SSHG_Mean,
#               -TT_Sec_SSHG_Mean_F,
#               -TT_Sec_Med,
#               -TT_Sec_Med_F,
#               -TT_Sec_SS_Med,
#               -TT_Sec_SS_Med_F,
#               -TT_Sec_SSHG_Med,
#               -TT_Sec_SSHG_Med_F,
#               -TT_Sec_Cnt,
#               -TT_Sec_Cnt_F,
#               -TT_Sec_SS_Cnt,
#               -TT_Sec_SS_Cnt_F,
#               -TT_Sec_SSHG_Cnt,
#               -TT_Sec_SSHG_Cnt_F,
#               -TT_Hr_q2,
#               -TT_Hr_q98,
#               -TT_Hr_SS_q5,
#               -TT_Hr_SS_q95,
#               -TT_Hr_SSHG_q5,
#               -TT_Hr_SSHG_q95,
#               -TT_Hr_Mean,
#               -TT_Hr_Mean_F,
#               -TT_Hr_SS_Mean,
#               -TT_Hr_SS_Mean_F,
#               -TT_Hr_SSHG_Mean,
#               -TT_Hr_SSHG_Mean_F,
#               -TT_Hr_Med,
#               -TT_Hr_Med_F,
#               -TT_Hr_SS_Med,
#               -TT_Hr_SS_Med_F,
#               -TT_Hr_SSHG_Med,
#               -TT_Hr_SSHG_Med_F,
#               -TT_Hr_Cnt,
#               -TT_Hr_Cnt_F,
#               -TT_Hr_SS_Cnt,
#               -TT_Hr_SS_Cnt_F,
#               -TT_Hr_SSHG_Cnt,
#               -TT_Hr_SSHG_Cnt_F
#              ) %>% 
#        arrange(TravelDistance_Mi_NewHvrs) %>% 
#        head(500)
#     )

View(filter(AllDays_NewTravelDist,
            !is.na(TravelDistance_Mi_NewHvrs)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             ) %>% 
       arrange(TravelDistance_Mi_NewHvrs) %>%
       head(500)
    )

# examples of the smallest TravelDistance_Mi_NewHvrs values.
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1424440 & RowNum_OG <= 1424460) | # 1424450  --  direction change
                (RowNum_OG >= 763292 & RowNum_OG <= 763312) | # 763302  --  direction change
                (RowNum_OG >= 1679093 & RowNum_OG <= 1679113) | # 1679103  --  direction change
                (RowNum_OG >= 2860918 & RowNum_OG <= 2860938) # 2860928  --  looks correct
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )


View(filter(AllDays_NewTravelDist,
            !is.na(TravelDistance_Mi_NewHvrs)
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             ) %>% 
       arrange(desc(TravelDistance_Mi_NewHvrs)
              ) %>%
       head(500)
    )

# examples of the largest TravelDistance_Mi_NewHvrs values.
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1092000 & RowNum_OG <= 1092050) | # 1092030  --  direction change
                (RowNum_OG >= 1609460 & RowNum_OG <= 1609480) | # 1609470  -- direction change 
                (RowNum_OG >= 508904 & RowNum_OG <= 508924) | # 508914  --  direction change & original StopID was bad
                (RowNum_OG >= 2476345 & RowNum_OG <= 2476365) # 2476355  --  direction change
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
             )
    )

```


Investigation of TravelTime_Hr.

View(TravDistMi_Pctiles): 98% of TravelTime_Hr are between 7 seconds and 464 seconds (~8 minutes).
```{r}

TravTimeHr_Ntile <- select(AllDays_NewTravelDist,
                           TravelTime_Hr
                          ) %>% 
  mutate(# Pctile = ntile(AllDays_NewTravelDist$TravelTime_Hr, 100),
         # MinR = min_rank(AllDays_NewTravelDist$TravelTime_Hr),
         PctR = percent_rank(AllDays_NewTravelDist$TravelTime_Hr),
         PctR_Round = round(PctR, 2)
        ) 

# str(TravTimeHr_Ntile)

TravTimeHr_Ntile_Rows <- nrow(TravTimeHr_Ntile)

# View(tail(TravTimeHr_Ntile, 500))


TravTimeHr_Pctiles <- group_by(TravTimeHr_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinTravTimeHrAtPctile = min(TravelTime_Hr),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravTimeHr_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile),
         MinTravTimeSecAtPctile = MinTravTimeHrAtPctile * 3600
        )

rm(TravTimeHr_Ntile_Rows)
rm(TravTimeHr_Ntile)
View(TravTimeHr_Pctiles)
TravTimeHr_Pctiles

```


Investigation of TravelTime_Hr.

Histogram of TravelTime_Sec.
```{r}

TravTime_Sec_HistDen <- ggplot(filter(select(AllDays_NewTravelDist,
                                             TravelTime_Sec
                                            ),
                                      !is.na(TravelTime_Sec)
                                     ),
                               aes(x = TravelTime_Sec,
                                   y = ..density..
                                  )
                          ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  # stat_bin(binwidth = 5,
  #          geom = "text",
  #          size = 2.5,
  #          vjust = 1.5,
  #          aes(label = format(..count.., big.mark = ",")
  #             ),
  #         ) +
  coord_cartesian(xlim = c(0, 180), ylim = c(0, 0.02)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Time",
       x = "Travel Time (sec)",
       y = "Density"
      )

TravTime_Sec_HistDen

```


Investigation of TravelTime_Sec.

TravelTime_Sec values are NA.
```{r}

summary(AllDays_NewTravelDist$TravelTime_Sec)


View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(is.na(TravelTime_Sec) &
                BusDay_EventNum != 1  # TravelTime purposefully not calculated here
             )
    )

# examples of TravelTime_Sec values that are NA. These are NA because the Event_Time & Departure_Time readings are not accurate (i.e., the previous Departure_Time is BEFORE or EQUAL TO the current Event_Time).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 90809 & RowNum_OG <= 90829) | # 90819
                (RowNum_OG >= 90881 & RowNum_OG <= 90901) | # 90891
                (RowNum_OG >= 2597066 & RowNum_OG <= 2597086) | # 2597076
                (RowNum_OG >= 2613305 & RowNum_OG <= 2613325) # 2613315
           ) %>% 
       select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt"))
    )

```


Investigation of TravelTime_Sec.

TravelTime_Sec values are extremely small.
```{r}

View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(!is.na(TravelTime_Sec)
             ) %>% 
       arrange(TravelTime_Sec,
               desc(SpeedAvg_Mph_NewHvrs)
              ) %>%
       head(500)
    )

# examples where TravelTime_Sec is small (1 sec) and SpeedAvg_Mph_NewHvrs is large.
View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter((RowNum_OG >= 2217353 & RowNum_OG <= 2217373) | # 2217363
                (RowNum_OG >= 3090321 & RowNum_OG <= 3090341) | # 3090331
                (RowNum_OG >= 80764 & RowNum_OG <= 80784) | # 80774
                (RowNum_OG >= 33840 & RowNum_OG <= 33860) # 33850
           )
    )

```


Investigation of TravelTime_Sec.

TravelTime_Sec values are extremely large.
```{r}

View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter(!is.na(TravelTime_Sec)
             ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )

# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(select(AllDays_NewTravelDist,
            -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
           ) %>% 
       filter((RowNum_OG >= 1007703 & RowNum_OG <= 1007723) | # 1007713
                (RowNum_OG >= 2373564 & RowNum_OG <= 2373584) | # 2373574
                (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
                (RowNum_OG >= 2570060 & RowNum_OG <= 2570080) # 2570070
           )
    )


```


Investigation of TravelTime_Sec.

Are large TravelTime_Sec values related to RouteChanges? Looks likely. When the Bus involves a Route "change", there is almost twice as likely to be a case of an outlier TravelTime_Sec value (on the high side).
```{r}

TTLargeRteChng <- select(AllDays_NewTravelDist,
                         -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
                        ) %>% 
  mutate(TT_Out = factor(ifelse(TravelTime_Sec > 464,  # this is the 99th percentile
                                "Outlier",
                                "Normal"
                               )
                        )
        )

# str(TTLargeRteChng)


TTLargeRteChng_Cnts <- group_by(TTLargeRteChng,
                                RteChange2,
                                TT_Out
                               ) %>% 
  summarise(Cnts = n()
           )

TTLargeRteChng_Spread <- as.data.frame(spread(TTLargeRteChng_Cnts,
                                              TT_Out,
                                              Cnts
                                             )
                                      ) %>%
  select(-RteChange2)

row.names(TTLargeRteChng_Spread) <- c("Change", "Same")
# str(TTLargeRteChng_Spread)


# When the Bus involves a Route "change", there is almost twice as likely to be a case of an outlier TravelTime_Sec value.
TTLargeRteChng_Spread
prop.table(as.table(as.matrix(TTLargeRteChng_Spread)
                   ),
           1
          )

prop.table(as.table(as.matrix(TTLargeRteChng_Spread)
                   ),
           2
          )

# rm(TTLargeRteChng, TTLargeRteChng_Spread)
         

View(filter(TTLargeRteChng,
            !is.na(TravelTime_Sec) &
              RteChange2 == "Same"
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )


# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(filter(TTLargeRteChng,
            (RowNum_OG >= 2250290 & RowNum_OG <= 2250310) | # 2250300
              (RowNum_OG >= 867717 & RowNum_OG <= 867737) | # 867727
              (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
              (RowNum_OG >= 808395 & RowNum_OG <= 808415) # 808405
           )
    )

```


Investigation of TravelTime_Sec.

If TravelTime_Sec is below the 5th percentile for that StartStop_ID, or if TravelTime_Sec is above the 95th percentile for that StartStop_ID,  consider this an outlier.  In this case, replace the value with the mean for that StartStop_ID and HourGroup (TT_Sec_SSHG_Mean_F), or if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.
```{r}

rm(TTLargeRteChng, TTLargeRteChng_Cnts, TTLargeRteChng_Spread)


NewTravTime <- mutate(AllDays_NewTravelDist,
                      TT_Sec_New = ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SSHG_Cnt_F >= 20,
                                          TT_Sec_SSHG_Mean_F,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SSHG_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt_F >= 20,
                                          TT_Sec_SS_Mean_F,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SS_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt >= 20,
                                          TT_Sec_SS_Mean,
                                   ifelse(!is.na(TravelTime_Sec) &
                                            (TravelTime_Sec < TT_Sec_SSHG_q5 |
                                               TravelTime_Sec > TT_Sec_SSHG_q95
                                            ) &
                                            TT_Sec_SS_Cnt_F < 20 &
                                            TT_Sec_SS_Cnt < 20 &
                                            RteChange2 == "Change",
                                          NA,
                                          TravelTime_Sec
                                         )))),
                      
                      TT_Sec_New_Label = 
           factor(ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SSHG_Cnt_F >= 20,
                         "TT_Sec_SSHG_Mean_F",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SSHG_Cnt_F < 20 &
                           TT_Sec_SS_Cnt_F >= 20,
                         "TT_Sec_SS_Mean_F",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                            ) &
                           TT_Sec_SS_Cnt_F < 20 &
                           TT_Sec_SS_Cnt >= 20,
                         "TT_Sec_SS_Mean",
                  ifelse(!is.na(TravelTime_Sec) &
                           (TravelTime_Sec < TT_Sec_SSHG_q5 |
                              TravelTime_Sec > TT_Sec_SSHG_q95
                           ) &
                           TT_Sec_SS_Cnt_F < 20 &
                           TT_Sec_SS_Cnt < 20 &
                           RteChange2 == "Change",
                         NA,
                         "TravelTime_Sec"
                        ))))
                 ),
                  
                  TT_Hr_New = TT_Sec_New / (60 * 60)
           )


dim(AllDays_NewTravelDist)
dim(NewTravTime)
rm(AllDays_NewTravelDist)

summary(select(NewTravTime,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )

str(select(NewTravTime,
           TravelTime_Sec,
           TT_Sec_New,
           TT_Sec_New_Label,
           TT_Hr_New
          )
   )


summary(select(NewTravTime,
               TravelTime_Sec,
               TT_Sec_New,
               TT_Sec_New_Label,
               TT_Hr_New
              )
       )

```


Test investigation of just the X2 Route. Box plots for time between bus arrivals (by HourGroup).
```{r}

View(head(select(NewTravTime,
                 -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
                )
         )
    )

X2 <- select(NewTravTime,
             -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
            ) %>% 
  filter(Route == "X2")

str(X2)

View(head(arrange(X2,
                  Bus_ID,
                  Event_Time
                 ),
          500
         )
    )

X2_ByStop <- group_by(X2,
                      StopID_Clean
                     ) %>% 
  arrange(StopID_Clean,
          Event_Time) %>% 
  mutate(Event_Time_L1 = lag(Event_Time),
         TimeToEvent_Sec = as.numeric(Event_Time - Event_Time_L1),
         TimeToEvent_Min = TimeToEvent_Sec / 60
        )

View(head(X2_ByStop, 500))


# Count_Values is needed to display the medians on the box plots
Count_Values <- ddply(as.data.frame(X2_ByStop),
                      .(Event_Time_HrGroup),
                      summarise,
                      Value_Counts = median(TimeToEvent_Min, na.rm = TRUE)
                     )

TimeBtwEvents_X2_BoxPlot <- ggplot(select(as.data.frame(X2_ByStop),
                                          TimeToEvent_Min,
                                          Event_Time_HrGroup
                                         ),
                                   aes(factor(Event_Time_HrGroup),
                                       TimeToEvent_Min,
                                       fill = factor(Event_Time_HrGroup)
                                      )
                                  ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 120)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Hour Group",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_BoxPlot

```


Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Hour Group).
```{r}

TimeBtwEvents_X2_ViolinPlot <- ggplot(select(as.data.frame(X2_ByStop),
                                             TimeToEvent_Min,
                                             Event_Time_HrGroup
                                             ),
                                      aes(factor(Event_Time_HrGroup),
                                          TimeToEvent_Min,
                                          fill = factor(Event_Time_HrGroup)
                                         )
                                     ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 80)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Hour Group",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_ViolinPlot

```


Test investigation of just the X2 Route. Box plots for time between bus arrivals (by Zip Code).
```{r}

# Count_Values is needed to display the medians on the box plots
Count_Values_z <- ddply(as.data.frame(X2_ByStop),
                        .(Stop_Zip),
                        summarise,
                        Value_Counts = median(TimeToEvent_Min, na.rm = TRUE)
                       )

TimeBtwEvents_X2_BoxPlot_z <- ggplot(select(as.data.frame(X2_ByStop),
                                            TimeToEvent_Min,
                                            Stop_Zip
                                           ),
                                     aes(factor(Stop_Zip),
                                         TimeToEvent_Min,
                                         fill = factor(Stop_Zip)
                                        )
                                    ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = Count_Values_z,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 100)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Zip Code of Destination",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_BoxPlot_z

```


Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Zip Code).
```{r}

TimeBtwEvents_X2_ViolinPlot_z <- ggplot(select(as.data.frame(X2_ByStop),
                                               TimeToEvent_Min,
                                               Stop_Zip
                                               ),
                                        aes(factor(Stop_Zip),
                                            TimeToEvent_Min,
                                            fill = factor(Stop_Zip)
                                           )
                                       ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = Count_Values_z,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 60)
                 ) +
  labs(title = "How Often an X2 Arrives at a Given Stop",
       x = "Zip Code of Destination",
       y = "Time Between Busses (min)"
      )

TimeBtwEvents_X2_ViolinPlot_z

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

First, get the max and min times of bus stops (each day, and for each route).
```{r}

RouteMinMax <- group_by(NewTravTime,
                        Route,
                        Event_Time_Date
                       ) %>% 
  summarise(MinTime = min(Event_Time),
            MaxTime = max(Event_Time)
           )

str(RouteMinMax)
View(RouteMinMax)

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

(Pulls here are done by day, as the data are too large to do at once.)
```{r}

# View(head(NewTravTime, 500))

# For each record, create a random datetime between the first and last stop for that bus route (on that day).
for(i in 3:7){

set.seed(123456789)
Samp <- select(NewTravTime,
               RowNum_OG,
               Route,
               # RouteGroup,
               Event_Time_Date,
               StopID_Clean,
               starts_with("Event")
              ) %>% 
  filter(Event_Time_Date == i) %>%  # needed to do this each day (3-7) because the complete file was too large to do at once
  left_join(RouteMinMax,
            by = c("Route" = "Route",
                   "Event_Time_Date" = "Event_Time_Date"
                  )
           ) %>% 
  mutate(SampTime = as_datetime(runif(nrow(.), #200000,
                                      min = MinTime,
                                      max = MaxTime
                                     ),
                                tz = "America/New_York"
                               )
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) 

# str(Samp)
# View(head(Samp, 500))
# 
# View(
# group_by(Samp,
#          RowNum_OG
#         ) %>%
#   summarise(Cnt_Num = n(),
#             Cnt_Pct = 100 * Cnt_Num / nrow(Samp)
#            ) %>%
#   arrange(desc(Cnt_Num))
# )


# For each Route and StopID combination, get all the Event_Time values that are after the SampTime value.
# estimating approx 2hrs of runtime for all 2.8m records
Testing_A <- sqldf("   Select               t1.*
                                            ,t2.Event_Time             as NextBus
                        From                 Samp                      as t1
                             Inner Join      Samp                      as t2
                                On              t1.Route = t2.Route
                                And             t1.StopID_Clean = t2.StopID_Clean
                                And             t2.Event_Time > t1.SampTime
                        Order By             t1.Route
                                            ,t1.StopID_Clean
                                            ,t1.Event_Time
                                            ,t2.Event_Time
                  "
                 ) %>% 
  mutate(NB = as_datetime(NextBus,
                          tz = "America/New_York"
                         )
        )

# str(Testing_A)
# View(head(Testing_A, 500))
# View(head(Samp, 500))


# Filter the dataframe to only include the bus arrival at StopID that is the next to come after the SampTime.
# estimating approx 20min of runtime for all 2.8m records
Testing <- select(Testing_A,
                  -NextBus
                 ) %>% 
  group_by(RowNum_OG) %>% 
  filter(NB == min(NB)
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) %>% 
  mutate(WaitTime_Min = as.numeric(NB - SampTime),
         WaitTime_Sec = WaitTime_Min * 60,
         WaitTime_Sec2 = NB - SampTime,
         WaitTime_Min2 = WaitTime_Sec2 / 60
        ) %>% 
  as.data.frame()

assign(paste0("Testing_", i),
       Testing
      )

rm(Samp,Testing_A, Testing)
str(get(paste0("Testing_", i)))
View(get(paste0("Testing_", i)))
}


# Bind all the individual dataframes together.
WaitData_DayPull <- bind_rows(Testing_3,
                              Testing_4,
                              Testing_5,
                              Testing_6,
                              Testing_7
                             ) %>% 
  mutate(WaitTime_Sec3 = NB - SampTime,
         WaitTime_Min3 = WaitTime_Sec3 / 60
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         )

rm(Testing_3, Testing_4, Testing_5, Testing_6, Testing_7)
str(WaitData_DayPull)
View(head(WaitData_DayPull, 500))
View(tail(WaitData_DayPull, 500))

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

Basic investigation of any missing rows from data pulled by day.
```{r}

DistinctRowNum_OG <- distinct(select(WaitData_DayPull,
                                     RowNum_OG
                                    )
                             )

str(DistinctRowNum_OG)

View(
anti_join(Samp,
          DistinctRowNum_OG,
          by = c("RowNum_OG" = "RowNum_OG")
         )
)


# The samp time is AFTER the last bus passed that StopID_Clean
View(filter(Samp,
            Event_Time > "2016-10-07 19:48:41" &
              Route == "X2" &
              StopID_Clean == 1003774
           )
    )

# Next Bus (NB) can be on the next morning
View(filter(Testing7,
            SampTime > "2016-10-06 23:58:00" &
              SampTime < "2016-10-06 23:59:59")
    )

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

(Pulls here are done by groupings of bus routes, as the data are too large to do at once.)

First, we need to find the most common bus routes.
```{r}

rm(DistinctRowNum_OG)


# View(head(NewTravTime, 500))

set.seed(123456789)
BusGroups <- group_by(NewTravTime,
                      Route
                     ) %>% 
  summarise(Cnt_Num = n(),
            Cnt_Pct = Cnt_Num / nrow(NewTravTime)
           ) %>% 
  arrange(desc(Cnt_Num)
         ) %>% 
  mutate(RowNum = row_number(),
         RandNum = runif(n = 268),
         RouteGroup = ifelse(RandNum <= 0.2,
                             1,
                      ifelse(RandNum <= 0.4,
                             2,
                      ifelse(RandNum <= 0.6,
                             3,
                      ifelse(RandNum <= 0.8,
                             4,
                             5
                            ))))
        )

str(BusGroups)
View(BusGroups)
summary(BusGroups)

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

(Pulls here are done by groupings of bus routes, as the data are too large to do at once.)
```{r}

# View(head(NewTravTime, 500))

# For each record, create a random datetime between the first and last stop for that bus route (on that day).
for(i in 1:5){
  
set.seed(123456789)
Samp <- left_join(NewTravTime,
                  BusGroups,
                  by = c("Route" = "Route")
                  ) %>% 
  select(RowNum_OG,
         Route,
         RouteGroup,
         Event_Time_Date,
         StopID_Clean,
         starts_with("Event")
        ) %>% 
  filter(RouteGroup == i) %>%  # needed to do this each RouteGroup (1-5) because the complete file was too large to do at once
  left_join(RouteMinMax,
            by = c("Route" = "Route",
                   "Event_Time_Date" = "Event_Time_Date"
                  )
           ) %>% 
  mutate(SampTime = as_datetime(runif(nrow(.), #200000,
                                      min = MinTime,
                                      max = MaxTime
                                     ),
                                tz = "America/New_York"
                               )
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) 

# str(Samp)
# View(head(Samp, 500))
# 
# View(
# group_by(Samp,
#          RowNum_OG
#         ) %>%
#   summarise(Cnt_Num = n(),
#             Cnt_Pct = 100 * Cnt_Num / nrow(Samp)
#            ) %>%
#   arrange(desc(Cnt_Num))
# )


# For each Route and StopID combination, get all the Event_Time values that are after the SampTime value.
# estimating approx 2hrs of runtime for all 2.8m records
Testing_A <- sqldf("   Select               t1.*
                                            ,t2.Event_Time             as NextBus
                        From                 Samp                      as t1
                             Inner Join      Samp                      as t2
                                On              t1.Route = t2.Route
                                And             t1.StopID_Clean = t2.StopID_Clean
                                And             t2.Event_Time > t1.SampTime
                        Order By             t1.Route
                                            ,t1.StopID_Clean
                                            ,t1.Event_Time
                                            ,t2.Event_Time
                  "
                 ) %>% 
  mutate(NB = as_datetime(NextBus,
                          tz = "America/New_York"
                         )
        )

# str(Testing_A)
# View(head(Testing_A, 500))
# View(head(Samp, 500))


# Filter the dataframe to only include the bus arrival at StopID that is the next to come after the SampTime.
# estimating approx 20min of runtime for all 2.8m records
Testing <- select(Testing_A,
                  -NextBus
                 ) %>% 
  group_by(RowNum_OG) %>% 
  filter(NB == min(NB)
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         ) %>% 
  mutate(WaitTime_Min = as.numeric(NB - SampTime),
         WaitTime_Sec = WaitTime_Min * 60
        ) %>% 
  as.data.frame()

assign(paste0("Testing", i),
       Testing
      )

rm(Samp,Testing_A, Testing)
str(get(paste0("Testing", i)))
View(get(paste0("Testing", i)))
}


# Bind all the individual dataframes together.
WaitData_RoutePull <- bind_rows(Testing1,
                                Testing2,
                                Testing3,
                                Testing4,
                                Testing5
                             ) %>% 
  mutate(WaitTime_Sec2 = NB - SampTime,
         WaitTime_Min2 = WaitTime_Sec2 / 60
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         )

rm(BusGroups, i, Testing3, Testing4, Testing5, Testing6, Testing7)
str(WaitData_RoutePull)
View(head(WaitData_RoutePull, 500))
View(tail(WaitData_RoutePull, 500))

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

Compare WaitData pulled by day and pulled by route.
```{r}

dim(WaitData_RoutePull)
dim(WaitData_DayPull)
nrow(WaitData_RoutePull) - nrow(WaitData_DayPull)

WaitData_Diff <- anti_join(WaitData_RoutePull,
                           WaitData_DayPull,
                           by = c("RowNum_OG" = "RowNum_OG"
                                 )
                          ) %>% 
  select(-WaitTime_Min,
         -WaitTime_Sec
        )

str(WaitData_Diff)
View(head(WaitData_Diff, 500))

View(filter(WaitData_RoutePull,
            Route == "Z8" &
              StopID_Clean == 2005465
            # RowNum_OG = 2902760
            # Event_Time = 2016-10-07 19:51:47
           )
    )

View(group_by(WaitData_Diff,
              Route
             ) %>% 
       summarise(Cnt_Num = n(),
                 Cnt_Pct = Cnt_Num / nrow(WaitData_Diff)
                ) %>% 
       arrange(desc(Cnt_Num)
              )
    )

View(filter(WaitData_Diff,
            Route == "S1"
           )
    )

View(filter(WaitData_RoutePull,
            Route == "S1" &
              StopID_Clean == 1003132
            # RowNum_OG = 1151770
            # Event_Time = 2016-10-07 09:07:12
           )
    )

# Can't tell why the pull by day has less records than the pull by route

```


Waiting time analyses.

Munging and sampling data to go from time beteen buses to "average" waiting time.

Compare WaitData (pulled by route) and original data (NewTravTime).
```{r}

# rm(WaitData_Diff)


dim(NewTravTime)  # 2,809,529 rows
dim(WaitData_RoutePull)  # 2,780,848 rows
nrow(NewTravTime) - nrow(WaitData_RoutePull)  # is 28,681 rows

str(select(NewTravTime,
           -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
          )
   )
str(WaitData_RoutePull)

Compare_NTT_WD <- left_join(NewTravTime,
                            select(WaitData_RoutePull,
                                   RowNum_OG,
                                   # Route,
                                   RouteGroup,
                                   # StopID_Clean,
                                   # Event_Time,
                                   MinTime,
                                   MaxTime,
                                   SampTime,
                                   NB,
                                   WaitTime_Sec2,
                                   WaitTime_Min2
                                  ),
                            by = c("RowNum_OG" = "RowNum_OG")
                           ) %>% 
  select(-matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
        ) %>% 
  arrange(Route,
          StopID_Clean,
          Event_Time
         )

str(Compare_NTT_WD)  # 2,810,109 rows overall  --  29,261 rows with no match
View(head(Compare_NTT_WD, 500))
View(filter(Compare_NTT_WD,
            is.na(MinTime)
           )
    )



View(anti_join(Samp,
               distinct(select(WaitData_RoutePull,
                               RowNum_OG
                              )
                       ),
               by = c("RowNum_OG" = "RowNum_OG")
              )
    )

# The SampTime is AFTER the last bus passed that StopID_Clean
View(filter(Samp,
              Route == "X2" &
              StopID_Clean == 1003774
            # RowNum_OG = 1146723
            # Event_Time = 2016-10-07 15:32:18
           )
    )

```


Clean up the data a bit.
```{r}

rm(BusGroups, RouteMinMax, Samp, Testing1, Testing2, Testing3, Testing4, Testing5, Testing_3, Testing_4, Testing_5, Testing_6, Testing_7, WaitData_DayPull, WaitData_Diff)


str(Compare_NTT_WD)
View(head(Compare_NTT_WD, 500))
View(head(mutate(Compare_NTT_WD,
                 WT_Min = as.numeric(WaitTime_Min2)
                )
         )
    )

WaitTime_AsNum <- Compare_NTT_WD %>% 
  mutate(RouteStop_ID = factor(paste(Route, StopID_Clean, sep = "__")
                              )
        )
WaitTime_AsNum$WaitTime_Sec2 <- as.numeric(WaitTime_AsNum$WaitTime_Sec2)
WaitTime_AsNum$WaitTime_Min2 <- as.numeric(WaitTime_AsNum$WaitTime_Min2)

rm(Compare_NTT_WD)
str(WaitTime_AsNum)

```


General exploration of wait times.
```{r}

summary(WaitTime_AsNum$WaitTime_Min2)

WT_Quantiles <- as.data.frame(quantile(WaitTime_AsNum$WaitTime_Min2,
                                       probs = seq(0, 1, 0.01),
                                       na.rm = TRUE
                                      )
                             )

colnames(WT_Quantiles) <- "Value_Min"

WT_Quantiles$Value_Sec = format(round(WT_Quantiles$Value_Min * 60,
                                      digits = 2
                                     ),
                                nsmall = 2
                               )
WT_Quantiles$Value_Hr = format(round(WT_Quantiles$Value_Min / 60,
                                     digits = 2
                                    ),
                                nsmall = 2
                               )
WT_Quantiles$Value_Min = format(round(WT_Quantiles$Value_Min,
                                      digits = 2
                                     ),
                                nsmall = 2
                               )

WT_Quantiles$Quantile <- seq(0, 1, 0.01)

WT_Quantiles <- select(WT_Quantiles,
                       Quantile,
                       Value_Sec,
                       Value_Min,
                       Value_Hr
                      )

str(WT_Quantiles)
View(WT_Quantiles)


View(arrange(WaitTime_AsNum,
             desc(WaitTime_Min2)
            ) %>% 
       head(., 5000)
    )

View(filter(WaitTime_AsNum,
            between(WaitTime_Min2, 60, 200)
           ) %>% 
       arrange(desc(WaitTime_Min2)
              ) 
     # %>% 
     #   head(., 5000)
    )

# Example of extreme wait times
View(filter(WaitTime_AsNum,
            Route == "W13" &  # only 2 bus passes in the entire dataset
              StopID_Clean == 1003728
            # Event_Time = 2016-10-03 08:42:46
           )
    )

# Example of extreme wait times
View(filter(WaitTime_AsNum,
            Route == "S41" &  # only 4 bus passes in the entire dataset
              StopID_Clean == 1001095
            # Event_Time = 2016-10-05 15:41:47
           )
    )

# Example of extreme wait times
View(filter(WaitTime_AsNum,
            Route == "D8" &  # route has VERY limited service after midnight
              StopID_Clean == 1001669
            # Event_Time = 2016-10-06 20:31:16
           )
    )

```


Looks like there might be an issue in wait times when very few Route-Stop combinations are included in the dataset.  Let's explore these.
```{r}

RouteStop_Cnts <- group_by(WaitTime_AsNum,
                           RouteStop_ID
                          ) %>% 
  summarise(RouteStop_CntNum = n(),
            RouteStop_CntPct = RouteStop_CntNum / nrow(WaitTime_AsNum)
           ) %>% 
  arrange(RouteStop_CntNum)

View(RouteStop_Cnts)


RouteStop_CntOfCnt <- group_by(RouteStop_Cnts,
                               RouteStop_CntNum
                              ) %>% 
  summarise(RouteStopCnt_CntNum = n(),
            RouteStopCnt_CntPct = RouteStopCnt_CntNum / nrow(RouteStop_Cnts)
           ) %>% 
  mutate(RouteStopCnt_CntPct_CumSum = cumsum(RouteStopCnt_CntPct),
         x = 1 - RouteStopCnt_CntPct_CumSum
        ) %>% 
  arrange(RouteStop_CntNum)
  
 View(RouteStop_CntOfCnt)

```


Histogram of the counts of Route-StopID combinations.
```{r}

RouteStop_Cnts_Bar <- ggplot(RouteStop_CntOfCnt,
                             aes(x = RouteStop_CntNum,
                                 # y = ..density..
                                 y = RouteStopCnt_CntNum
                                )
                            ) +
  # geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_col(fill = "lightblue", colour = "grey60", size = 0.2) +
  coord_cartesian(xlim = c(0, 500)
                  # ylim = c(0, 0.02)
                 ) +
  labs(title = "Variation in Routes Passing a Specific Stop",
       x = "Occurrences of Route-StopID Combiantions",
       y = "Counts"
      )

RouteStop_Cnts_Bar

```


Create a new dataset limiting extremely small counts of Route-StopID combinations.
```{r}

WaitTime_RteCnts <- left_join(WaitTime_AsNum,
                              RouteStop_Cnts,
                              by = c("RouteStop_ID" = "RouteStop_ID")
                             ) %>% 
  select(-RouteStop_CntPct)

dim(WaitTime_AsNum)
dim(WaitTime_RteCnts)

rm(WaitTime_AsNum)
str(WaitTime_RteCnts)


# Total rows
nrow(WaitTime_RteCnts)

# Rows of rare RouteStops
nrow(filter(WaitTime_RteCnts,
            RouteStop_CntNum <= 60
           )
    ) / nrow(WaitTime_RteCnts)

# Rows of extremely long wait times
nrow(filter(WaitTime_RteCnts,
            WaitTime_Min2 > 180
           )
    ) / nrow(WaitTime_RteCnts)


select(WaitTime_RteCnts,
       WaitTime_Min2
      ) %>% 
  summary()

filter(WaitTime_RteCnts,
       RouteStop_CntNum > 60  # 12 passes per day in a 5-day dataset
      ) %>% 
  select(WaitTime_Min2) %>% 
  summary()

filter(WaitTime_RteCnts,
       WaitTime_Min2 < 180  # probably means that something went wrong
      ) %>% 
  select(WaitTime_Min2) %>% 
  summary()



a <- as.data.frame(select(WaitTime_RteCnts,
                          WaitTime_Min2
                         ) %>% 
                     quantile(probs = seq(0, 1, 0.01), na.rm = TRUE)
                  )

b <- as.data.frame(filter(WaitTime_RteCnts,
                          RouteStop_CntNum > 60
                         ) %>% 
                     select(WaitTime_Min2) %>% 
                     quantile(probs = seq(0, 1, 0.01), na.rm = TRUE)
                  )

c <- as.data.frame(filter(WaitTime_RteCnts,
                          WaitTime_Min2 < 180
                         ) %>% 
                     select(WaitTime_Min2) %>% 
                     quantile(probs = seq(0, 1, 0.01), na.rm = TRUE)
                  )

WT_Filter_Quantiles <- bind_cols(a, b, c) %>% 
  mutate(Quantile = seq(0, 1, 0.01)
        )

colnames(WT_Filter_Quantiles) <- c("All", "RteStpAbv60", "WTBlw180", "Quantile")
rm(a, b, c)
View(WT_Filter_Quantiles)

```


Histogram of all wait times.
```{r}

WaitTime_AllBus_HistDen <- ggplot(filter(select(WaitTime_RteCnts,
                                                WaitTime_Min2
                                               ),
                                         !is.na(WaitTime_Min2)
                                        ),
                                  aes(x = WaitTime_Min2,
                                      y = ..density..
                                     )
                                ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  scale_x_continuous(breaks = seq(0, 300, 30)
                    ) +
  coord_cartesian(xlim = c(0, 300),
                  ylim = c(0, 0.035)
                 ) +
  labs(title = "Variation in Wait Time",
       x = "Wait Time (min)",
       y = "Density"
      )

WaitTime_AllBus_HistDen

```


Box plots for WaitTime (all busses, by Zip Code).
```{r}

# Count_Values is needed to display the medians on the box plots
BusRoute <- select(WaitTime_RteCnts,
                   Route,
                   WaitTime_Min2,
                   Stop_Zip
                  ) %>% 
  filter(Route == "X2")

CountValues_AllBus_Zip <- ddply(BusRoute,
                                .(Stop_Zip),
                                summarise,
                                Value_Counts = median(WaitTime_Min2, na.rm = TRUE)
                               )

WaitTime_AllBus_Zip_Box <- ggplot(BusRoute,
                                  aes(factor(Stop_Zip),
                                      WaitTime_Min2,
                                      fill = factor(Stop_Zip)
                                     )
                                 ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for the X2)",
       x = "Zip Code of Destination",
       y = "Waiting Time (min)"
      )

WaitTime_AllBus_Zip_Box

```


Test investigation of just the X2 Route. Violin plots for time between bus arrivals (by Zip Code).
```{r}

WaitTime_AllBus_Zip_Violin <- ggplot(BusRoute,
                                     aes(factor(Stop_Zip),
                                         WaitTime_Min2,
                                         fill = factor(Stop_Zip)
                                        )
                                    ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_Zip,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for the X2)",
       x = "Zip Code of Destination",
       y = "Waiting Time (min)"
      )

TimeBtwEvents_X2_ViolinPlot_z

```


Box plots for WaitTime (Zip Code, by HourGroupZip).
```{r}

# Count_Values is needed to display the medians on the box plots
Zip <- select(WaitTime_RteCnts,
              Route,
              WaitTime_Min2,
              Stop_Zip,
              Event_Time_HrGroup
             ) %>% 
  filter(Stop_Zip == 20002)

CountValues_AllBus_HG <- ddply(Zip,
                               .(Event_Time_HrGroup),
                               summarise,
                               Value_Counts = median(WaitTime_Min2,
                                                     na.rm = TRUE
                                                    )
                               )

WaitTime_AllBus_HG_Box <- ggplot(Zip,
                                 aes(factor(Event_Time_HrGroup),
                                     WaitTime_Min2,
                                     fill = factor(Event_Time_HrGroup)
                                    )
                                ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_HG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for Zip 20002)",
       x = "Hour Group",
       y = "Waiting Time (min)"
      )
  # facet_wrap(~Stop_Zip
  #            # nrow = 5
  #           )

WaitTime_AllBus_HG_Box

```


Violin plots for WaitTime (Zip Code, by HourGroupZip).
```{r}

WaitTime_AllBus_HG_Vln <- ggplot(Zip,
                                 aes(factor(Event_Time_HrGroup),
                                     WaitTime_Min2,
                                     fill = factor(Event_Time_HrGroup)
                                    )
                                ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_HG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 90)
                 ) +
  labs(title = "Waiting Time at a Given Stop (for Zip 20002)",
       x = "Hour Group",
       y = "Waiting Time (min)"
      )
  # facet_wrap(~Stop_Zip
  #            # nrow = 5
  #           )

WaitTime_AllBus_HG_Vln

```


Box plots for WaitTime (Route, by HourGroupZip).
```{r}

# Count_Values is needed to display the medians on the box plots
Rte <- select(WaitTime_RteCnts,
              Route,
              WaitTime_Min2,
              Stop_Zip,
              Event_Time_HrGroup
             ) %>% 
  filter(Route == "X2")

CountValues_AllBus_RteHG <- group_by(Rte,
                                     Event_Time_HrGroup
                                    ) %>% 
  summarise(
                                      Value_Counts = median(WaitTime_Min2,
                                                       na.rm = TRUE
                                                      ),
                                      VC = quantile(WaitTime_Min2, probs = 0.9,
                                                    na.rm = TRUE)
                                      )
  
  # ddply(Rte,
  #                                .(Event_Time_HrGroup),
  #                                summarise,
  #                                Value_Counts = median(WaitTime_Min2,
  #                                                      na.rm = TRUE
  #                                                     )
  #                               )

WaitTime_AllBus_RteHG_Box <- ggplot(Rte,
                                    aes(factor(Event_Time_HrGroup),
                                        WaitTime_Min2,
                                        fill = factor(Event_Time_HrGroup)
                                       )
                                   ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE, na.rm = TRUE) +
  geom_text(data = CountValues_AllBus_RteHG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, max(CountValues_AllBus_RteHG$VC))
                 ) +
  labs(title = "Waiting Time at a Given Stop",
       subtitle = ("Route X2"),
       x = "Hour Group",
       y = "Waiting Time (min)"
      ) 
# +
#   facet_wrap(~Stop_Zip
#              # nrow = 5
#             )

WaitTime_AllBus_RteHG_Box

```


Violin plots for WaitTime (Zip Code, by HourGroupZip).
```{r}

WaitTime_AllBus_RteHG_Vln <- ggplot(Rte,
                                    aes(factor(Event_Time_HrGroup),
                                        WaitTime_Min2,
                                        fill = factor(Event_Time_HrGroup)
                                       )
                                   ) + 
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
              trim = TRUE,
              scale = "count",
              na.rm = TRUE,
              show.legend = NA,
              inherit.aes = TRUE
             ) +
  geom_text(data = CountValues_AllBus_RteHG,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 2.5,
            vjust = -0.5
           ) +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  coord_cartesian(# xlim = c(0, 180),
                  ylim = c(0, 45)
                 ) +
  labs(title = "Waiting Time at a Given Stop",
       subtitle = ("(Route X2)"),
       x = "Hour Group",
       y = "Waiting Time (min)"
      ) +
  facet_wrap(~Stop_Zip
             # nrow = 5
            )

WaitTime_AllBus_RteHG_Vln

```


X2 Percentiles Line Graph Test.
```{r}

X2_Pct <- select(WaitTime_RteCnts,
                 Route,
                 Stop_Zip,
                 Event_Time_Date,
                 Event_Time_Day,
                 Event_Time_HrGroup,
                 Event_Time_Hr,
                 Latitude,
                 Longitude,
                 WaitTime_Min2
                ) %>% 
  filter(Route == "X2") %>% 
  group_by(Event_Time_Hr,
           Stop_Zip
          ) %>% 
  summarise(Pct50 = quantile(WaitTime_Min2, probs = 0.5, na.rm = TRUE),
            Pct60 = quantile(WaitTime_Min2, probs = 0.6, na.rm = TRUE),
            Pct70 = quantile(WaitTime_Min2, probs = 0.7, na.rm = TRUE),
            Pct80 = quantile(WaitTime_Min2, probs = 0.8, na.rm = TRUE),
            Pct90 = quantile(WaitTime_Min2, probs = 0.9, na.rm = TRUE)
           )

str(X2_Pct)
View(X2_Pct)


X2_Long <- gather(X2_Pct,
                  key = Percentile,
                  value = Pctile,
                  Pct50,
                  Pct60,
                  Pct70,
                  Pct80,
                  Pct90
                )

str(X2_Long)
View(X2_Long)


X2_WaitByHr_Line <- ggplot(X2_Long,
                           aes(x = Event_Time_Hr,
                               y = Pctile,
                               factor(Percentile),
                               color = Percentile
                              )
                          ) +
  geom_line() +
  theme(legend.title=element_blank(),
        legend.position = "bottom"
       ) +
  coord_cartesian(xlim = c(0, 23)
                  # ylim = c(0, 45)
                 ) + 
  scale_x_continuous(breaks = seq(0, 23, 2)
                    ) +
  labs(title = "Waiting Time Throughout the Day",
       subtitle = ("(Route X2)"),
       x = "Hour of the Day",
       y = "Waiting Time (min)"
      ) +
  facet_wrap(~Stop_Zip)

X2_WaitByHr_Line

```





GET DATA READY FOR SHINY
```{r}

# str(WaitTime_RteCnts)

Shiny_WaitData_Base <- select(WaitTime_RteCnts,
                              Route,
                              Stop_Zip,
                              Event_Time_Date,
                              Event_Time_Day,
                              Event_Time_HrGroup,
                              Event_Time_Hr,
                              Latitude,
                              Longitude,
                              WaitTime_Min2
                             )

Shiny_WaitData_Base$Route <- factor(Shiny_WaitData_Base$Route)

str(Shiny_WaitData_Base)

write.table(Shiny_WaitData_Base,
            "Shiny_WaitData_Base.txt",
            quote = FALSE,
            sep = "\t",
            row.names = FALSE
           )
# rm(Shiny_WaitData_Base)

```



Testing mapping functionaltiy
```{r}

# devtools::install_github("dkahle/ggmap")
# devtools::install_github("hadley/ggplot2")
# install.packages("ggmap", type = "source")

# devtools::install_github('hadley/ggplot2')
devtools::install_github("hadley/ggplot2@v2.2.0")
# devtools::install_github('thomasp85/ggforce')
# devtools::install_github('thomasp85/ggraph')
# devtools::install_github('slowkow/ggrepel')







tract <- 
  readOGR(dsn = "/Users/mdturse/Desktop/Analytics/DCMetroBus/cb_2015_11_tract_500k",
          layer = "cb_2015_11_tract_500k"
         )
  
class(tract)


# convert the GEOID to a character
tract@data$GEOID <- as.character(tract@data$GEOID)

str(tract@data)



ggtract <- tidy(tract)#, region = "GEOID")
str(ggtract)
summary(ggtract)



map <- get_map(location = c(lon = -77.03676, lat = 38.89784),
               source = "google",
               # maptype = "roadmap"
               zoom = 12
              )

ggmap(map) +
  geom_polygon(aes(x = long, y = lat, group = group), 
               data = ggtract,
               colour = 'white', 
               fill = 'black', 
               alpha = .4, 
               size = .3
              )
  






ZipWaitTest <- filter(Shiny_WaitData_Base,
                      WaitTime_Min2 <= 180 &
                        !is.na(Stop_Zip)
                     ) %>% 
  group_by(Stop_Zip,
           Event_Time_Hr
          ) %>% 
  summarise(Pct80 = quantile(WaitTime_Min2, probs = 0.8, na.rm = TRUE)
           ) %>% 
  arrange(Event_Time_Hr,
          Stop_Zip
         )


View(head(ZipWaitTest, 500))




```


Investigating TravelTime_Sec.
```{r}

View(filter(TTLargeRteChng,
            !is.na(TravelTime_Sec) &
              RteChange2 == "Same"
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph_NewHvrs
              ) %>%
       head(500)
    )


# examples where TravelTime_Sec is small (1 sec) and SpeedAvg_Mph_NewHvrs is large.
View(select(NewTravTime,
            # -matches("(q(2|5|(95)|(98)))|Mean|Med|Cnt")
            -(TD_Mi_q2:TD_Mi_SSHG_Cnt_F),
            -(TT_Hr_q2:TT_Hr_SSHG_Cnt_F)
           ) %>% 
       filter((RowNum_OG >= 2217353 & RowNum_OG <= 2217373) | # 2217363
                (RowNum_OG >= 3090321 & RowNum_OG <= 3090341) | # 3090331
                (RowNum_OG >= 80764 & RowNum_OG <= 80784) | # 80774
                (RowNum_OG >= 33840 & RowNum_OG <= 33860) # 33850
           )
    )






# examples where TravelTime_Sec is large and SpeedAvg_Mph_NewHvrs is small.
View(filter(TTLargeRteChng,
            (RowNum_OG >= 2250290 & RowNum_OG <= 2250310) | # 2250300
              (RowNum_OG >= 867717 & RowNum_OG <= 867737) | # 867727
              (RowNum_OG >= 864379 & RowNum_OG <= 864399) | # 864389
              (RowNum_OG >= 808395 & RowNum_OG <= 808415) # 808405
           )
    )
```




```{r}

         
         
# examples where TravelTime_Sec is unusually small (with TravelDistance_Mi values that are large).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 1042228 & RowNum_OG <= 1042248) | # 1042238
                (RowNum_OG >= 53816 & RowNum_OG <= 53836) | # 53826
                (RowNum_OG >= 360571 & RowNum_OG <= 360591) | # 360581
                (RowNum_OG >= 502271 & RowNum_OG <= 502291) # 502281 (can't explian the weird TravelTime_Sec calculation here - it's not even an integer!)
           )
    )

# still trying to explain 502281...on the day of this weirdness, the bus was only in circulation for 4-5 stops (~20 minutes) on that day (Oct 6)
View(filter(AllDays_NewTravelDist,
            Bus_ID == 2711
           )
    )


# exploring large values for TravelTime_Sec
View(filter(AllDays_NewTravelDist,
            TravelTime_Sec == 300
           ) %>% 
       arrange(desc(TravelTime_Sec),
               SpeedAvg_Mph2
              )
    )

# examples where TravelTime_Sec is unusually large (with TravelDistance_Mi values that are small, so SpeedAvg_Mph values are very small).
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 2627459 & RowNum_OG <= 2627479) | # 2627469
                (RowNum_OG >= 2193344 & RowNum_OG <= 2193364) | # 2193354
                (RowNum_OG >= 1644123 & RowNum_OG <= 1644143) | # 1644133
                (RowNum_OG >= 869600 & RowNum_OG <= 869620) # 869610
           )
    )

```

Investigation of SpeedAvg_Mph2

View(Speed_Pctiles): 90% of SpeedAvg_Mph2 are between ~3mph and ~66mph.
```{r}

Speed_Ntile <- as.data.frame(AllDays_NewTravelDist$SpeedAvg_Mph2) %>% 
  mutate(Pctile = ntile(AllDays_NewTravelDist$SpeedAvg_Mph2, 100),
         MinR = min_rank(AllDays_NewTravelDist$SpeedAvg_Mph2),
         PctR = percent_rank(AllDays_NewTravelDist$SpeedAvg_Mph2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(Speed_Ntile)[1] <- "SpeedAvg_Mph2"
str(Speed_Ntile)

Speed_Ntile_Rows <- nrow(Speed_Ntile)

View(tail(Speed_Ntile, 500))


Speed_Pctiles <- group_by(Speed_Ntile,
                          PctR_Round
                         ) %>% 
  summarise(
    MinSpeedAtPctile = min(SpeedAvg_Mph2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / Speed_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(Speed_Pctiles)

```

Investigation of SpeedAvg_Mph2.

Exploring the removal of outlier TravelTime_Sec and TravelDistance_Mi.
```{r}

summary(select(AllDays_NewTravelDist,
               SpeedAvg_Mph,
               SpeedAvg_Mph2
              )
       )

summary(select(filter(AllDays_NewTravelDist,
                      TravelDistance_Mi > 0.0001893939 & # lowest non-zero percentile
                        TravelDistance_Mi < 1.0812500000 & # 99th percentile
                        TravelTime_Sec > 10.050000 & # 2nd percentile
                        TravelTime_Sec < 293.000000 # 98th percentile
                     ),
               SpeedAvg_Mph,
               SpeedAvg_Mph2
              )
       )

```


Investigation of SpeedAvg_Mph2.

Histogram of SpeedAvg_Mph2.
```{r}

Speed_HistDen <- ggplot(filter(AllDays_NewTravelDist,
                               !is.na(SpeedAvg_Mph2)
                              ),
                        aes(x = SpeedAvg_Mph2,
                            y = ..density..
                           )
                       ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  stat_bin(binwidth = 5,
           geom = "text",
           size = 2.5,
           vjust = 1.5,
           aes(label = format(..count.., big.mark = ",")
              ),
          ) +
  # geom_text(aes(label = format(..count.., big.mark = ",")
  #              ),
  #           size = 3,
  #           nudge_y = (..count.. * 0.1)
  #          ) +
  coord_cartesian(xlim = c(0, 70), ylim = c(0, 0.04)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Speed",
       x = "Average Speed (mph)",
       y = "Density"
      )

Speed_HistDen

```


Investigation of SpeedAvg_Mph2.

Histogram of SpeedAvg_Mph2 after removing outlier TravelTime_Sec and TravelDistance_Mi.
```{r}

View(TravDistMiNew_Pctiles)
View(TravTimeHr_Pctiles)

SpeedNoOutlier_HistDen <- ggplot(filter(AllDays_NewTravelDist,
                                        !is.na(SpeedAvg_Mph2) &
                                          TravelDistance_Mi_New > 0.077841005 & # 5th percentile
                                          # TravelDistance_Mi_New < 1.0812500000 & # 99th percentile
                                          TravelTime_Sec > 12.100000 # 4th percentile
                                          # TravelTime_Sec < 293.000000 # 98th percentile
                                       ),
                                 aes(x = SpeedAvg_Mph2,
                                     y = ..density..
                                    )
                                ) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  stat_bin(binwidth = 5,
           geom = "text",
           size = 2.5,
           vjust = 1.5,
           aes(label = format(..count.., big.mark = ",")
              ),
          ) +
  # geom_text(aes(label = format(..count.., big.mark = ",")
  #              ),
  #           size = 3,
  #           nudge_y = (..count.. * 0.1)
  #          ) +
  coord_cartesian(xlim = c(0, 70), ylim = c(0, 0.04)
                 ) +
  #  theme(legend.position="none") +
  labs(title = "Variation in Travel Speed",
       subtitle = "(removed low outliers of Travel Distance and Travel Time)",
       x = "Average Speed (mph)",
       y = "Density"
      )

SpeedNoOutlier_HistDen

```


Investigation of SpeedAvg_Mph2.

New dataset (NoOutliers_TravelDistNTime) when removing outlier low values of TravelDistance_Mi_New and TravelTime_Sec.
```{r}

View(TravDistMiNew_Pctiles)
View(TravTimeHr_Pctiles)

NoOutliers_TravelDistNTime <- filter(AllDays_NewTravelDist,
                                     TravelDistance_Mi_New > .077841005 & # 5th percentile
                                       # TravelDistance_Mi_New < 1.0812500000 & # 99th percentile
                                       TravelTime_Sec > 12.100000 # 4th percentile
                                       # TravelTime_Sec < 293.000000 # 98th percentile
                                    )

nrow(AllDays_NewTravelDist) - nrow(NoOutliers_TravelDistNTime)

str(NoOutliers_TravelDistNTime)
summary(NoOutliers_TravelDistNTime)

```


Investigation of SppedAvg_Mph2.

View(Speed_NoOut_Pctiles):  Aproximately 90% of SpeedAvg_Mph2 values are between ~4mph and ~56mph.
```{r}

Speed_NoOut_Ntile <- as.data.frame(NoOutliers_TravelDistNTime$SpeedAvg_Mph2) %>% 
  mutate(Pctile = ntile(NoOutliers_TravelDistNTime$SpeedAvg_Mph2, 100),
         MinR = min_rank(NoOutliers_TravelDistNTime$SpeedAvg_Mph2),
         PctR = percent_rank(NoOutliers_TravelDistNTime$SpeedAvg_Mph2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(Speed_NoOut_Ntile)[1] <- "SpeedAvg_Mph2"
str(Speed_NoOut_Ntile)

Speed_NoOut_Ntile_Rows <- nrow(Speed_NoOut_Ntile)

View(tail(Speed_NoOut_Ntile, 500))


Speed_NoOut_Pctiles <- group_by(Speed_NoOut_Ntile,
                                PctR_Round
                               ) %>% 
  summarise(
    MinSpeedAtPctile = min(SpeedAvg_Mph2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / Speed_NoOut_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(Speed_NoOut_Pctiles)

```


Investigation of SppedAvg_Mph2.

Exloring odd/impossible values.
```{r}

# Exploring when SpeedAvg_Mph2 is NA  --  does not occur at all
nrow(filter(NoOutliers_TravelDistNTime,
            is.na(SpeedAvg_Mph2)
           )
    )


# Exploring when SpeedAvg_Mph2 is zero  --  does not occur at all
nrow(filter(NoOutliers_TravelDistNTime,
            SpeedAvg_Mph2 == 0
           )
    )


# examples where SpeedAvg_Mph2 < 3.2848770
View(filter(AllDays_NewTravelDist,
            SpeedAvg_Mph2 > 0 &
              SpeedAvg_Mph2 < 3.2848770
           ) %>% 
       arrange(SpeedAvg_Mph2)
    )

# examples where SpeedAvg_Mph2 < 3.2848770
View(filter(AllDays_NewTravelDist,
            (RowNum_OG >= 485338 & RowNum_OG <= 485358) | # 485348  --  Extreme travel time, Route Change
                (RowNum_OG >= 346952 & RowNum_OG <= 346972) | # 346962  -- Extreme travel time, Route Change 
                (RowNum_OG >= 70494 & RowNum_OG <= 70514) | # 70504  --  Extreme travel time, Route Change
                (RowNum_OG >= 2051846 & RowNum_OG <= 2051866) # 2051856  --  Extreme travel time, Route Change
           )
    )

```


Investigation of SpeedAvg_Mph2.

Limit the dataset based on SpeedAvg_Mph2.
```{r}

NoOutliersSpeed <- filter(NoOutliers_TravelDistNTime,
                          between(SpeedAvg_Mph2,
                                  4.069300, # 5th percentile
                                  56.05651 #95th percentile
                                 )
                          )

nrow(NoOutliers_TravelDistNTime) - nrow(NoOutliersSpeed)

summary(NoOutliersSpeed)

```


TravelTime now looks like it has some odd values on the high end.  So let's look at those.

View(TravTime_NoOut_Pctiles):  Virtually all trips should take less than 5 minutes. (The 99th percentile of of TravelTime is approximately 8 minutes.)
```{r}

TravTime_NoOut_Ntile <- as.data.frame(NoOutliersSpeed$TravelTime_Hr) %>% 
  mutate(Pctile = ntile(NoOutliersSpeed$TravelTime_Hr, 100),
         MinR = min_rank(NoOutliersSpeed$TravelTime_Hr),
         PctR = percent_rank(NoOutliersSpeed$TravelTime_Hr),
         PctR_Round = round(PctR, 2)
        )

colnames(TravTime_NoOut_Ntile)[1] <- "TravelTime_Hr"
str(TravTime_NoOut_Ntile)

TravTime_NoOut_Ntile_Rows <- nrow(TravTime_NoOut_Ntile)

View(tail(TravTime_NoOut_Ntile, 500))


TravTime_NoOut_Pctiles <- group_by(TravTime_NoOut_Ntile,
                                   PctR_Round
                                  ) %>% 
  summarise(
    MinTravTimeHrAtPctile = min(TravelTime_Hr),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / TravTime_NoOut_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile),
         MinTravTimeSecAtPctile = MinTravTimeHrAtPctile * (60 * 60)
        )

View(TravTime_NoOut_Pctiles)

```


Investigating odd TravelTime_Sec values.

Trips longer than ~8 minutes.
```{r}

View(filter(NoOutliersSpeed,
            TravelTime_Sec > 491 # min at the 100th percentile
           ) %>% 
       arrange(desc(TravelTime_Sec)
              )
    )

# examples of TravelTime_Sec values that are largest.
View(filter(NoOutliersSpeed,
            (RowNum_OG >= 2071759 & RowNum_OG <= 2071779) | # 2071769  --  results from a route change, and a 3hr+ wait before the new route starts
                (RowNum_OG >= 1473686 & RowNum_OG <= 1473706) | # 1473696  --  results from a route change, and a 3hr wait before the new route starts
                (RowNum_OG >= 1222822 & RowNum_OG <= 1222842) | # 1222832  --  results from a route change, and a 3hr wait before the new route starts
                (RowNum_OG >= 3046089 & RowNum_OG <= 3046109) # 3046099  --  results from a route change, and a 3hr wait before the new route starts
           )
    )


# examples of TravelTime_Sec values that are the smallest of the large.
View(filter(NoOutliersSpeed,
            (RowNum_OG >= 3044689 & RowNum_OG <= 3044709) | # 3044699  --  results from a route change
                (RowNum_OG >= 3022358 & RowNum_OG <= 3022378) | # 3022368  --  results from a route change
                (RowNum_OG >= 2993016 & RowNum_OG <= 2993036) | # 2993026  --  results from a previous route change (change occurred in deleted row)
                (RowNum_OG >= 2683703 & RowNum_OG <= 2683723) # 2683713  --  results from a previous route change (change occurred in deleted row)
           )
    )

```


Let's look at the TravelTime_Sec values and route changes (DirChange2).

The 99th percentile of TravelTime_Sec for both, all trips, and just those trips NOT involving route changes (DirChange2 = "Same"), is approximately 5min (300 sec).

Nota Bene:  The percentile calculation here is defined slightly different than in most of the above analyses (which get the lowest value in the bin created by 100 ntiles).
```{r}

summary(select(NoOutliersSpeed,
               TravelTime_Sec
              )
       )

summary(select(filter(NoOutliersSpeed,
                      DirChange2 == "Same"
                     ),
               TravelTime_Sec
              )
       )

summary(select(filter(NoOutliersSpeed,
                      DirChange2 == "Change"
                     ),
               TravelTime_Sec
              )
       )


TravTimeSec_Qtiles_df <- data.frame(PctValue = seq(0, 100, 1),
                                    All = seq(1, 101, 1),
                                    Same = seq(1, 101, 1),
                                    Change = seq(1, 101, 1)
                                   )

TravTimeSec_Qtiles_df[ , 2] <- quantile(select(NoOutliersSpeed,
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

TravTimeSec_Qtiles_df[ , 3] <- quantile(select(filter(NoOutliersSpeed,
                                                      DirChange2 == "Same"
                                                     ),
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

TravTimeSec_Qtiles_df[ , 4] <- quantile(select(filter(NoOutliersSpeed,
                                                      DirChange2 == "Change"
                                                     ),
                                               TravelTime_Sec
                                              ),
                                        probs = seq(0, 1, 0.01),
                                        na.rm = TRUE
                                       )

View(TravTimeSec_Qtiles_df)

```


Limit the dataset now based on TravelTime_Sec.
```{r}

UpperLimitTravTime <- filter(NoOutliersSpeed,
                             TravelTime_Sec <= 491 # min at the 100th percentile
                             )

nrow(NoOutliersSpeed) - nrow(UpperLimitTravTime)

str(UpperLimitTravTime)

summary(UpperLimitTravTime)

```


Investigation of Dwell_Time2 (how long the bus is at a stop).

Differences between Dwell_Time (by WMATA) and Dwell_Time2 (by me) appear to be due to switches in RouteAlt. WMATA calculates Dwell_Time by an unknown process. The WMATA calculation is equal to my calculation, except for the records immedaitely before and after a RouteAlt switch (DirChange2).
```{r}

View(filter(AllDays_NewOrder,
            Dwell_Time != Dwell_Time2
           )
    )


# Examples where the Dwell_Time and Dwell_Time2 are different
View(filter(AllDays_NewOrder,
            ( (RowNum_OG >= 65 & RowNum_OG <= 85) | # 75
                (RowNum_OG >= 162 & RowNum_OG <= 192) | # 172
                (RowNum_OG >= 431952 & RowNum_OG <= 431972) | # 431962
                (RowNum_OG >= 434595 & RowNum_OG <= 434615) # 434605  --  this record is NOT a route switch, but does has a Sequence switch (Me: should there really be a route switch here?)
            )
           )
    )

```


Investigation of Dwell_Time2 (how long the bus is at a stop).

First, create some "rank" stats.
View(DT2_Pctiles): 95% of Dwell_Time2s are <= 23 seconds...but some weird (e.g., nearly 2 hour Dwell_Time2s exist).
```{r}

DwellTime2_Ntile <- as.data.frame(AllDays_NewOrder$Dwell_Time2) %>% 
  mutate(Pctile = ntile(AllDays_NewOrder$Dwell_Time2, 100),
         MinR = min_rank(AllDays_NewOrder$Dwell_Time2),
         PctR = percent_rank(AllDays_NewOrder$Dwell_Time2),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DwellTime2_Ntile)[1] <- "Dwell_Time2"
str(DwellTime2_Ntile)

DwellTime2_Ntile_Rows <- nrow(DwellTime2_Ntile)

View(tail(DwellTime2_Ntile, 500))


DwellTime2_Pctiles <- group_by(DwellTime2_Ntile,
                               PctR_Round
                              ) %>% 
  summarise(
    MinDwellAtPctile = min(Dwell_Time2),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DwellTime2_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DwellTime2_Pctiles)

```


Investigation of Dwell_Time2 (how long the bus is at a stop).

Histogram of Dwell_Time2.
```{r}

DwellTime2_HistDen <- ggplot(AllDays_NewOrder, aes(x = Dwell_Time2, y = ..density..)) +
  geom_histogram(binwidth = 1, fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(1, 25), ylim = c(0, 0.05)
                 ) +
  xlab("Time a Bus Stays at a Stop (sec)") + 
  ylab("Density") + 
  #  theme(legend.position="none") + 
  ggtitle(expression(atop("Variation in How Long a Bus Stays at a Stop"
                          # ,atop(italic("xxxxx"),"")
                         )
                    )
         )

DwellTime2_HistDen

```


Investigation of Dwell_Time2 (how long the bus is at a stop).

Looking at some weirdly long Dwell_Time2 values.
```{r}

View(arrange(AllDays_NewOrder,
             desc(Dwell_Time2)
            )
    )


# examples of extremely large Dwell_Time2s
View(filter(AllDays_NewOrder,
            (RowNum_OG >= 292669 & RowNum_OG <= 292689) | # 292679
                (RowNum_OG >= 531057 & RowNum_OG <= 531077) | # 531067
                (RowNum_OG >= 1388627 & RowNum_OG <= 1388647) | # 1388637
                (RowNum_OG >= 1645711 & RowNum_OG <= 1645731) # 1645721
           )
    )


View(filter(AllDays_NewOrder,
            Dwell_Time2 == 0
           )
    )

```


Investigation of Delta_Time (how early or late the bus is).

View(DT2_Pctiles): 94% of Delta_Time values are between -236 seconds and 1,259 seconds. Roughly 66% of records are within 5 min late and 5 min early...but some weird (e.g., almost 50 minute late or 40 minute early) Delta_Times exist.

Note that Delta_Time is the difference from the scheduled bus arrival. So if two buses are scheduled to arrive at a destination at 10:00pm and 10:20pm, and if the 10:20pm bus has a Delta_Time of 5 minutes, there are 25 minutes between bus arrivals at the stop.

Also note that based on a comment at https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/, the Delta_Time values don't appear to coincide with published bus schedules (e.g., the X2 departing every 8 minutes during peak hours).
```{r}

DeltTime_Ntile <- as.data.frame(AllDays_NewOrder$Delta_Time) %>% 
  mutate(Pctile = ntile(AllDays_NewOrder$Delta_Time, 100),
         MinR = min_rank(AllDays_NewOrder$Delta_Time),
         PctR = percent_rank(AllDays_NewOrder$Delta_Time),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DeltTime_Ntile)[1] <- "Delta_Time"
str(DeltTime_Ntile)

DeltTime_Ntile_Rows <- nrow(DeltTime_Ntile)

View(tail(DeltTime_Ntile, 500))


DeltTime_Pctiles <- group_by(DeltTime_Ntile,
                             PctR_Round
                            ) %>% 
  summarise(
    MinDeltTimeAtPctile = min(Delta_Time),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DeltTime_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DeltTime_Pctiles)
DeltTime_Pctiles

# ~66% of rows are between 5 min late and 5 min early
nrow(filter(AllDays_NewOrder,
            Delta_Time >= -300 &
              Delta_Time <= 300
           )
    ) / nrow(AllDays_NewOrder)


# examples of weird large Delta_Times
View(filter(AllDays_NewOrder,
            Delta_Time < -4202 |
              Delta_Time > 1705
           ) %>% 
       arrange(desc(Delta_Time)
              )
    )

```


Investigation of Delta_Time (how early or late the bus is).

Delta_Time histogram.
```{r}

DeltTime_HistDen <- ggplot(AllDays_NewOrder, aes(x = (Delta_Time / 60),
                                                 y = ..density..
                                                )
                          ) +
  geom_histogram(binwidth = (5/60), fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_line(stat = "density", colour = "red") +
  coord_cartesian(xlim = c(-5, 5)) +
  xlab("Bus Lateness (min)") + 
  ylab("Density") + 
  #  theme(legend.position="none") + 
  ggtitle(expression(atop("Variation in How Early/Late a Bus Is",
                          atop(italic("(positive values are late arrivals)"),
                               ""
                              )
                         )
                    )
         )

DeltTime_HistDen

```


Investigation of Delta_Time (how early or late the bus is).

Delta_Time boxplot.
```{r}

# Count_Values is needed to display the medians on the box plots
Count_Values <- ddply(AllDays_NewOrder,
                      .(Event_Time_HrGroup),
                      summarise,
                      Value_Counts = median(Delta_Time / 60, na.rm = TRUE)
                     )

DeltTime_BoxPlot <- ggplot(AllDays_NewOrder,
                           aes(factor(Event_Time_HrGroup),
                               Delta_Time / 60,
                               fill = factor(Event_Time_HrGroup)
                              )
                          ) + 
  geom_boxplot(outlier.colour="red", notch=TRUE) + 
  # coord_cartesian(ylim = c(-300, 1200)) +
  coord_cartesian(ylim = c(-5, 20)) +
  geom_text(data = Count_Values,
            aes(y = Value_Counts,
                label = format(round(Value_Counts, digits = 1),
                               nsmall = 1
                              )
               ),
            size = 3,
            vjust = -0.5
           ) +
  xlab("Hour Group") + 
  ylab("Bus Lateness (minutes)") + 
  theme(legend.position="none", axis.text.x = element_text(angle=45)) + 
  #theme(legend.position="right", axis.text.x = element_blank()) + 
  ggtitle(expression(atop("How Early/Late is the Bus (by Hour Group)",
                          atop(italic("(positive values are late arrivals)"),
                               ""
                              )
                         )
                    )
         )

DeltTime_BoxPlot

```


Investigation of Delta_Time (how early or late the bus is).

Exploring "extreme" Delta_Times.  First let's get some "rank" stats.
```{r}

View(DeltTime_Pctiles)
DeltTime_Pctiles


DeltTimeAbs_Ntile <- as.data.frame(abs(AllDays_NewOrder$Delta_Time)) %>% 
  mutate(Pctile = ntile(abs(AllDays_NewOrder$Delta_Time), 100),
         MinR = min_rank(abs(AllDays_NewOrder$Delta_Time)),
         PctR = percent_rank(abs(AllDays_NewOrder$Delta_Time)),
         PctR_Round = round(PctR, 2)
        ) 

colnames(DeltTimeAbs_Ntile)[1] <- "Delta_Time_Abs"
str(DeltTimeAbs_Ntile)

DeltTimeAbs_Ntile_Rows <- nrow(DeltTimeAbs_Ntile)

View(tail(DeltTimeAbs_Ntile, 500))


DeltTimeAbs_Pctiles <- group_by(DeltTimeAbs_Ntile,
                                PctR_Round
                               ) %>% 
  summarise(
    MinDeltTimeAtPctile = min(Delta_Time_Abs),
    CntsAtPctile = n(),
    PctsAtPctile = CntsAtPctile / DeltTime_Ntile_Rows
  ) %>% 
  mutate(CumSumPAtP = cumsum(PctsAtPctile)
        )

View(DeltTimeAbs_Pctiles)
DeltTimeAbs_Pctiles

```


Investigation of Delta_Time (how early or late the bus is).

Exploring "extreme" Delta_Times.  Then let's calculate the percentage of buses that are 10 minutes (or more) late/early.
```{r}

HrGroup_DeltaTime_All <- group_by(AllDays_NewOrder,
                                  Event_Time_HrGroup
                                 ) %>% 
  summarise(EventAll_Cnt = n()
           )

str(HrGroup_DeltaTime_All)
View(HrGroup_DeltaTime_All)


HrGroup_DeltaTime_Above10Min <- filter(AllDays_NewOrder,
                                       abs(Delta_Time) >= 600
                                      ) %>% 
  group_by(Event_Time_HrGroup) %>% 
  summarise(EventAbove10_Cnt = n()
           )

str(HrGroup_DeltaTime_Above10Min)
View(HrGroup_DeltaTime_Above10Min)


HrGroup_DeltaTimeCompare <- inner_join(HrGroup_DeltaTime_Above10Min,
                                       HrGroup_DeltaTime_All,
                                       by = c("Event_Time_HrGroup" = "Event_Time_HrGroup")
                                      ) %>% 
  mutate(PctEventsAbove10 = EventAbove10_Cnt / EventAll_Cnt)

View(HrGroup_DeltaTimeCompare)

```


Investigation of Delta_Time (how early or late the bus is).

Quickly plot these "extreme" Delta_Times. 
```{r}

DeltTime_Above10_Cols <- ggplot(HrGroup_DeltaTimeCompare,
                                aes(factor(Event_Time_HrGroup),
                                    PctEventsAbove10
                                   )
                               ) +
  geom_col(fill = "lightblue", colour = "grey60", size = 0.2) +
  geom_text(aes(label = format(round(PctEventsAbove10, digits = 2),
                               nsmall = 2
                              )
               ),
            size = 3,
            nudge_y = (HrGroup_DeltaTimeCompare$PctEventsAbove10 * -0.1)
           ) +
  # coord_cartesian(xlim = c(-5, 5)) +
  xlab("Hour Group") + 
  ylab("Percent of All Bus Arrivals") +
  theme(legend.position="none", axis.text.x = element_text(angle=45)) +
  ggtitle(expression(atop("When is a Bus 10+ Minutes Late/Early"
                          # ,atop(italic("positive values are late arrivals"),
                          #      ""
                          #     )
                         )
                    )
         )

DeltTime_Above10_Cols

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Correlation.
```{r}

DwellTDeltaT_Corr <- as.matrix(cor(x = AllDays_NewOrder$Dwell_Time2,
                                   y = AllDays_NewOrder$Delta_Time,
                                   use = "pairwise"
                                  )
                               )

DwellTDeltaT_Corr

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Next, let's get a sample of data for plotting. Let's do this for the full dataset (AllDays_NewOrder).
```{r}

AllDays_NewOrder_10PctSamp <- sample_frac(AllDays_NewOrder, 0.1) %>% 
  select(Delta_Time,
         Dwell_Time2
        ) %>% 
  mutate(DataSet = "AllData")

str(AllDays_NewOrder_10PctSamp)

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Let's also get a sample of data for plotting, but with a datset that removes outliers.
```{r}

View(DeltTime_Pctiles)
View(DwellTime2_Pctiles)

AllDays_NewOrder_NoExtremes_10PctSamp <- filter(AllDays_NewOrder,
                                                between(Delta_Time, -402, 1705) & # removes about 2% of Delta_Time values
                                                  between(Dwell_Time2, 1, 63)  # removes about 2% of Dwell_Time2 values
                                               ) %>% 
  sample_frac(0.1) %>% 
  select(Delta_Time,
         Dwell_Time2
        ) %>% 
  mutate(DataSet = "OutliersRemoved")

str(AllDays_NewOrder_NoExtremes_10PctSamp)

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from the dataset that does not remove outliers.
```{r}

DwellTDeltaT_Scatter <- ggplot(AllDays_NewOrder_10PctSamp,
                               aes(Dwell_Time2, Delta_Time)
                              ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "red") +
  # xlab("Time at Stop (sec)") + 
  # ylab("Lateness (sec)") +
  annotate(label = lm_eqn(df = AllDays_NewOrder_10PctSamp,
                          y = AllDays_NewOrder_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_10PctSamp$Dwell_Time2
                         ),
           x = 2200,
           y = 600,
           geom = "text",
           size = 3,
           colour = "red",
           parse = TRUE
          ) +
  labs(title = "Lateness vs Time at Stop",
       subtitle = "(no outliers removed)",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
  #                         ,atop(italic("(no outliers removed)"),
  #                               ""
  #                              )
  #                        )
  #                   )
  #        )
# +
#   geom_jitter()

DwellTDeltaT_Scatter

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from the dataset that does remove outliers.
```{r}

DwellTDeltaT_Scatter_NoExtremes <- ggplot(AllDays_NewOrder_NoExtremes_10PctSamp,
                                          aes(Dwell_Time2, Delta_Time)
                                         ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  geom_smooth(method = "lm", colour = "blue") +
  # xlab("Time at Stop (sec)") + 
  # ylab("Lateness (sec)") +
  annotate(label = lm_eqn(df = AllDays_NewOrder_NoExtremes_10PctSamp,
                          y = AllDays_NewOrder_NoExtremes_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_NoExtremes_10PctSamp$Dwell_Time2
                         ),
           x = 50,
           y = -475,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  labs(title = "Lateness vs Time at Stop",
       subtitle = "(2% of outliers removed)",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
  #                         ,atop(italic("(2% of outliers removed)"),
  #                               ""
  #                              )
  #                        )
  #                   )
  #        )
# +
#   geom_jitter()

DwellTDeltaT_Scatter_NoExtremes

```


Quick investigation on the relationship between Dwell_Time2 (the time a bus is at a stop) and Delta_Time (how early/late the bus is).

Plotting the data from both datasets together.
```{r}

CombinedData <- rbind(AllDays_NewOrder_10PctSamp,
                      AllDays_NewOrder_NoExtremes_10PctSamp
                     )

CombinedData$DataSet <- factor(CombinedData$DataSet)

str(CombinedData)


DwellTDeltaT_Scatter_Combined <- ggplot(CombinedData,
                                        aes(x = Dwell_Time2,
                                            y = Delta_Time,
                                            colour = DataSet
                                           )
                                       ) +
  geom_point(shape = 1, alpha = 0.5) +
  scale_shape(solid = FALSE) +
  coord_cartesian(xlim = c(0, 500), ylim = c(-1000, 2000)
                 ) +
  geom_smooth(data = filter(CombinedData,
                            DataSet == "AllData"
                           ),
              method = "lm",
              colour = "red"
             ) +
  geom_smooth(data = filter(CombinedData,
                            DataSet == "OutliersRemoved"
                           ),
              method = "lm",
              colour = "blue"
             ) +
  # facet_wrap( ~ DataSet, ncol = 2) +
  annotate(label = lm_eqn(df = AllDays_NewOrder_10PctSamp,
                          y = AllDays_NewOrder_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_10PctSamp$Dwell_Time2
                         ),
           x = 300,
           y = -600,
           geom = "text",
           size = 3,
           colour = "red",
           parse = TRUE
          ) +
  annotate(label = lm_eqn(df = AllDays_NewOrder_NoExtremes_10PctSamp,
                          y = AllDays_NewOrder_NoExtremes_10PctSamp$Delta_Time,
                          x = AllDays_NewOrder_NoExtremes_10PctSamp$Dwell_Time2
                         ),
           x = 300,
           y = -800,
           geom = "text",
           size = 3,
           colour = "blue",
           parse = TRUE
          ) +
  theme(legend.position = "bottom") +
  labs(title = "Lateness vs Time at Stop",
       x = "Time at Stop (sec)",
       y = "Lateness (sec)"
      )
  # ggtitle(expression(atop("Lateness vs Time at Stop"
                          # ,atop(italic("2% of outliers removed"),
                          #       ""
                          #      )
         #                 )
         #            )
         # )
# +
#   geom_jitter()

DwellTDeltaT_Scatter_Combined

```











Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file).
